[llvm] f5c62ee - [PHIElimination] Reuse existing COPY in predecessor basic block (#131837)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 29 11:29:03 PDT 2025
Author: Guy David
Date: 2025-06-29T21:28:42+03:00
New Revision: f5c62ee0fa0466382cb11f6fad80d323b0fca057
URL: https://github.com/llvm/llvm-project/commit/f5c62ee0fa0466382cb11f6fad80d323b0fca057
DIFF: https://github.com/llvm/llvm-project/commit/f5c62ee0fa0466382cb11f6fad80d323b0fca057.diff
LOG: [PHIElimination] Reuse existing COPY in predecessor basic block (#131837)
The insertion point of COPY isn't always optimal and could eventually
lead to a worse block layout; see the regression test in the first
commit.
This change affects many architectures, but the total number of
instructions in the test cases seems to be slightly lower.
Added:
llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
llvm/test/CodeGen/AArch64/block-layout-regression.mir
Modified:
llvm/lib/CodeGen/PHIElimination.cpp
llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll
llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll
llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll
llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir
llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
llvm/test/CodeGen/AArch64/bfis-in-loop.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
llvm/test/CodeGen/AArch64/phi.ll
llvm/test/CodeGen/AArch64/pr48188.ll
llvm/test/CodeGen/AArch64/ragreedy-csr.ll
llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
llvm/test/CodeGen/AArch64/reduce-or-opt.ll
llvm/test/CodeGen/AArch64/sink-and-fold.ll
llvm/test/CodeGen/AArch64/sve-lsrchain.ll
llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
llvm/test/CodeGen/AArch64/swifterror.ll
llvm/test/CodeGen/AArch64/tbl-loops.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/div_i128.ll
llvm/test/CodeGen/AMDGPU/div_v2i128.ll
llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
llvm/test/CodeGen/AMDGPU/mul.ll
llvm/test/CodeGen/AMDGPU/rem_i128.ll
llvm/test/CodeGen/AMDGPU/sdiv64.ll
llvm/test/CodeGen/AMDGPU/srem64.ll
llvm/test/CodeGen/AMDGPU/udiv64.ll
llvm/test/CodeGen/AMDGPU/urem64.ll
llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
llvm/test/CodeGen/AMDGPU/wave32.ll
llvm/test/CodeGen/ARM/and-cmp0-sink.ll
llvm/test/CodeGen/ARM/cttz.ll
llvm/test/CodeGen/ARM/select-imm.ll
llvm/test/CodeGen/ARM/struct-byval-loop.ll
llvm/test/CodeGen/ARM/swifterror.ll
llvm/test/CodeGen/AVR/bug-81911.ll
llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
llvm/test/CodeGen/Hexagon/swp-stages4.ll
llvm/test/CodeGen/Hexagon/tinycore.ll
llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir
llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll
llvm/test/CodeGen/PowerPC/phi-eliminate.mir
llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir
llvm/test/CodeGen/PowerPC/pr116071.ll
llvm/test/CodeGen/PowerPC/sms-phi-2.ll
llvm/test/CodeGen/PowerPC/sms-phi-3.ll
llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll
llvm/test/CodeGen/PowerPC/subreg-postra-2.ll
llvm/test/CodeGen/PowerPC/vsx.ll
llvm/test/CodeGen/RISCV/abds.ll
llvm/test/CodeGen/RISCV/machine-pipeliner.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
llvm/test/CodeGen/RISCV/xcvbi.ll
llvm/test/CodeGen/SystemZ/swifterror.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
llvm/test/CodeGen/Thumb2/mve-phireg.ll
llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
llvm/test/CodeGen/Thumb2/pr52817.ll
llvm/test/CodeGen/VE/Scalar/br_jt.ll
llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
llvm/test/CodeGen/X86/atomic32.ll
llvm/test/CodeGen/X86/atomic64.ll
llvm/test/CodeGen/X86/atomic6432.ll
llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
llvm/test/CodeGen/X86/callbr-asm-kill.mir
llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
llvm/test/CodeGen/X86/combine-pmuldq.ll
llvm/test/CodeGen/X86/fp128-select.ll
llvm/test/CodeGen/X86/madd.ll
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/min-legal-vector-width.ll
llvm/test/CodeGen/X86/pcsections-atomics.ll
llvm/test/CodeGen/X86/pr15705.ll
llvm/test/CodeGen/X86/pr32256.ll
llvm/test/CodeGen/X86/pr38795.ll
llvm/test/CodeGen/X86/pr49451.ll
llvm/test/CodeGen/X86/pr63108.ll
llvm/test/CodeGen/X86/sad.ll
llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll
llvm/test/CodeGen/X86/swifterror.ll
llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index a93a89ecaa96e..959a1711727a4 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -581,6 +581,15 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB,
continue;
}
+ // Reuse an existing copy in the block if possible.
+ if (MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg)) {
+ if (DefMI->isCopy() && DefMI->getParent() == &opBlock &&
+ MRI->use_empty(SrcReg)) {
+ DefMI->getOperand(0).setReg(IncomingReg);
+ continue;
+ }
+ }
+
// Find a safe location to insert the copy, this may be the first terminator
// in the block (or end()).
MachineBasicBlock::iterator InsertPos =
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll
index c1c5c53aa7df2..6c300b04508b2 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll
@@ -118,8 +118,8 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_unordered:
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_unordered:
; -O1: ldxp xzr, x8, [x2]
@@ -131,8 +131,8 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_monotonic:
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_monotonic:
; -O1: ldxp xzr, x8, [x2]
@@ -144,8 +144,8 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_release:
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_release:
; -O1: ldxp xzr, x8, [x2]
@@ -157,8 +157,8 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
; -O1: ldaxp xzr, x8, [x2]
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll
index d1047d84e2956..2a7bbad9d6454 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll
@@ -117,13 +117,13 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0: ldxp x10, x12, [x9]
+; -O0: ldxp x8, x10, [x13]
+; -O0: cmp x8, x9
; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x10, [x13]
+; -O0: subs x10, x10, x11
+; -O0: ccmp x8, x9, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_unordered:
; -O1: ldxp xzr, x8, [x2]
@@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0: ldxp x10, x12, [x9]
+; -O0: ldxp x8, x10, [x13]
+; -O0: cmp x8, x9
; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x10, [x13]
+; -O0: subs x10, x10, x11
+; -O0: ccmp x8, x9, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_monotonic:
; -O1: ldxp xzr, x8, [x2]
@@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0: ldxp x10, x12, [x9]
+; -O0: ldxp x8, x10, [x13]
+; -O0: cmp x8, x9
; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x10, [x13]
+; -O0: subs x10, x10, x11
+; -O0: ccmp x8, x9, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_release:
; -O1: ldxp xzr, x8, [x2]
@@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0: ldaxp x10, x12, [x9]
+; -O0: ldaxp x8, x10, [x13]
+; -O0: cmp x8, x9
; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x10, [x13]
+; -O0: subs x10, x10, x11
+; -O0: ccmp x8, x9, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
; -O1: ldaxp xzr, x8, [x2]
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll
index 1a79c73355143..493bc742f7663 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll
@@ -117,13 +117,13 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0: ldxp x10, x12, [x9]
+; -O0: ldxp x8, x10, [x13]
+; -O0: cmp x8, x9
; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x10, [x13]
+; -O0: subs x10, x10, x11
+; -O0: ccmp x8, x9, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_unordered:
; -O1: ldxp xzr, x8, [x2]
@@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0: ldxp x10, x12, [x9]
+; -O0: ldxp x8, x10, [x13]
+; -O0: cmp x8, x9
; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x10, [x13]
+; -O0: subs x10, x10, x11
+; -O0: ccmp x8, x9, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_monotonic:
; -O1: ldxp xzr, x8, [x2]
@@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0: ldxp x10, x12, [x9]
+; -O0: ldxp x8, x10, [x13]
+; -O0: cmp x8, x9
; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x10, [x13]
+; -O0: subs x10, x10, x11
+; -O0: ccmp x8, x9, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_release:
; -O1: ldxp xzr, x8, [x2]
@@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0: ldaxp x10, x12, [x9]
+; -O0: ldaxp x8, x10, [x13]
+; -O0: cmp x8, x9
; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x10, [x13]
+; -O0: subs x10, x10, x11
+; -O0: ccmp x8, x9, #0, eq
;
; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
; -O1: ldaxp xzr, x8, [x2]
diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir
index 01c44e3f253bb..993d1c1f1b5f0 100644
--- a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir
+++ b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir
@@ -37,7 +37,7 @@ body: |
bb.1:
%x:gpr32 = COPY $wzr
; Test that the debug location is not copied into bb1!
- ; CHECK: %3:gpr32 = COPY killed %x{{$}}
+ ; CHECK: %3:gpr32 = COPY $wzr
; CHECK-LABEL: bb.2:
bb.2:
%y:gpr32 = PHI %x:gpr32, %bb.1, undef %undef:gpr32, %bb.0, debug-location !14
diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir b/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
new file mode 100644
index 0000000000000..75283d9a7b14a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
@@ -0,0 +1,68 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -run-pass=phi-node-elimination -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+# Verify that the original COPY in bb.1 is reappropriated as the PHI source in bb.2,
+# instead of creating a new COPY with the same source register.
+
+---
+name: copy_virtual_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: copy_virtual_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $nzcv, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:gpr32 = COPY $w0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF
+ ; CHECK-NEXT: Bcc 8, %bb.2, implicit $nzcv
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr32 = COPY %a
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: %c:gpr32 = COPY [[DEF]]
+ ; CHECK-NEXT: %d:gpr32 = COPY %c
+ bb.0:
+ liveins: $nzcv, $w0
+ %a:gpr32 = COPY $w0
+ Bcc 8, %bb.2, implicit $nzcv
+ bb.1:
+ %b:gpr32 = COPY %a:gpr32
+ bb.2:
+ %c:gpr32 = PHI %b:gpr32, %bb.1, undef %undef:gpr32, %bb.0
+ %d:gpr32 = COPY %c:gpr32
+...
+
+---
+name: copy_physical_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: copy_physical_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $nzcv, $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF
+ ; CHECK-NEXT: Bcc 8, %bb.2, implicit $nzcv
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x0 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr32 = COPY $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: %b:gpr32 = COPY [[DEF]]
+ bb.0:
+ liveins: $nzcv, $w0
+ Bcc 8, %bb.2, implicit $nzcv
+ bb.1:
+ $x0 = IMPLICIT_DEF
+ %a:gpr32 = COPY $w0
+ bb.2:
+ %b:gpr32 = PHI %a:gpr32, %bb.1, undef %undef:gpr32, %bb.0
+...
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 8655bb1292ef7..ca1052a769408 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -583,8 +583,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-SD-NEXT: mov w10, w2
; CHECK-SD-NEXT: b.hi .LBB5_4
; CHECK-SD-NEXT: // %bb.2:
-; CHECK-SD-NEXT: mov x11, xzr
; CHECK-SD-NEXT: mov w8, wzr
+; CHECK-SD-NEXT: mov x11, xzr
; CHECK-SD-NEXT: b .LBB5_7
; CHECK-SD-NEXT: .LBB5_3:
; CHECK-SD-NEXT: mov w8, wzr
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
index 71e0250b36972..9fd27edae3176 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -199,16 +199,16 @@ define i128 @test_rmw_add_128(ptr %dst) {
; NOLSE-NEXT: sub sp, sp, #48
; NOLSE-NEXT: .cfi_def_cfa_offset 48
; NOLSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill
-; NOLSE-NEXT: ldr x8, [x0, #8]
-; NOLSE-NEXT: ldr x9, [x0]
+; NOLSE-NEXT: ldr x9, [x0, #8]
+; NOLSE-NEXT: ldr x8, [x0]
; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: b .LBB4_1
; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB4_2 Depth 2
-; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: adds x14, x11, #1
; NOLSE-NEXT: cinc x15, x13, hs
@@ -238,8 +238,8 @@ define i128 @test_rmw_add_128(ptr %dst) {
; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; NOLSE-NEXT: subs x12, x12, x13
; NOLSE-NEXT: ccmp x10, x11, #0, eq
-; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
-; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT: str x8, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: b.ne .LBB4_1
; NOLSE-NEXT: b .LBB4_6
; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end
@@ -253,15 +253,15 @@ define i128 @test_rmw_add_128(ptr %dst) {
; LSE-NEXT: sub sp, sp, #48
; LSE-NEXT: .cfi_def_cfa_offset 48
; LSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill
-; LSE-NEXT: ldr x8, [x0, #8]
-; LSE-NEXT: ldr x9, [x0]
+; LSE-NEXT: ldr x9, [x0, #8]
+; LSE-NEXT: ldr x8, [x0]
; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
; LSE-NEXT: b .LBB4_1
; LSE-NEXT: .LBB4_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload
; LSE-NEXT: mov x0, x10
; LSE-NEXT: mov x1, x11
@@ -276,8 +276,8 @@ define i128 @test_rmw_add_128(ptr %dst) {
; LSE-NEXT: str x8, [sp, #16] // 8-byte Folded Spill
; LSE-NEXT: subs x11, x8, x11
; LSE-NEXT: ccmp x9, x10, #0, eq
-; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
-; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT: str x8, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: b.ne .LBB4_1
; LSE-NEXT: b .LBB4_2
; LSE-NEXT: .LBB4_2: // %atomicrmw.end
@@ -573,16 +573,16 @@ define i128 @test_rmw_nand_128(ptr %dst) {
; NOLSE-NEXT: sub sp, sp, #48
; NOLSE-NEXT: .cfi_def_cfa_offset 48
; NOLSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill
-; NOLSE-NEXT: ldr x8, [x0, #8]
-; NOLSE-NEXT: ldr x9, [x0]
+; NOLSE-NEXT: ldr x9, [x0, #8]
+; NOLSE-NEXT: ldr x8, [x0]
; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: b .LBB9_1
; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB9_2 Depth 2
-; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: mov w8, w11
; NOLSE-NEXT: mvn w10, w8
@@ -616,8 +616,8 @@ define i128 @test_rmw_nand_128(ptr %dst) {
; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; NOLSE-NEXT: subs x12, x12, x13
; NOLSE-NEXT: ccmp x10, x11, #0, eq
-; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
-; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT: str x8, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: b.ne .LBB9_1
; NOLSE-NEXT: b .LBB9_6
; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
@@ -631,15 +631,15 @@ define i128 @test_rmw_nand_128(ptr %dst) {
; LSE-NEXT: sub sp, sp, #48
; LSE-NEXT: .cfi_def_cfa_offset 48
; LSE-NEXT: str x0, [sp, #24] // 8-byte Folded Spill
-; LSE-NEXT: ldr x8, [x0, #8]
-; LSE-NEXT: ldr x9, [x0]
+; LSE-NEXT: ldr x9, [x0, #8]
+; LSE-NEXT: ldr x8, [x0]
; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
; LSE-NEXT: b .LBB9_1
; LSE-NEXT: .LBB9_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload
; LSE-NEXT: mov x0, x10
; LSE-NEXT: mov x1, x11
@@ -658,8 +658,8 @@ define i128 @test_rmw_nand_128(ptr %dst) {
; LSE-NEXT: str x8, [sp, #16] // 8-byte Folded Spill
; LSE-NEXT: subs x11, x8, x11
; LSE-NEXT: ccmp x9, x10, #0, eq
-; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
-; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT: str x8, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: b.ne .LBB9_1
; LSE-NEXT: b .LBB9_2
; LSE-NEXT: .LBB9_2: // %atomicrmw.end
diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
index 43d49da1abd21..b0339222bc2df 100644
--- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
+++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -14,8 +14,8 @@ define i64 @bfis_in_loop_zero() {
; CHECK-LABEL: bfis_in_loop_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x9, :got:global
-; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ldr x9, [x9, :got_lo12:global]
; CHECK-NEXT: mov w10, #65536 // =0x10000
; CHECK-NEXT: ldr x9, [x9]
diff --git a/llvm/test/CodeGen/AArch64/block-layout-regression.mir b/llvm/test/CodeGen/AArch64/block-layout-regression.mir
new file mode 100644
index 0000000000000..a7ab5c029b142
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/block-layout-regression.mir
@@ -0,0 +1,85 @@
+# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s \
+# RUN: -start-before=phi-node-elimination -stop-after=branch-relaxation \
+# RUN: | FileCheck %s
+
+# Verify an optimal block layout is produced for the following nested loop, when
+# there's a PHI node in bb.5 that relies on an operand in bb.2.
+# The previous layout used a CBNZX for a null check followed by an unconditional
+# branch to bb.6, instead of a fallthrough.
+
+# [ bb.0 ENTRY ]
+# |
+# v
+# [ bb.1 ] <-------+
+# / \ |
+# v v |
+# [bb.2] [bb.3] <-+ |
+# | | | |
+# | v | |
+# | [bb.4] --+ |
+# | | |
+# v v |
+# [ bb.5 ] |
+# / \ |
+# | v |
+# | [bb.6] -----+
+# | |
+# v v
+# [ bb.7 RET ]
+
+# CHECK-LABEL: test
+# CHECK-NOT: CBNZX
+# CHECK-NOT: B %bb.
+# CHECK-COUNT-2: CBZX
+
+---
+name: test
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.1(0x80000000); %bb.1(100.00%)
+ liveins: $x0, $w1, $x2, $x3
+ %0:gpr64all = IMPLICIT_DEF
+ %1:gpr64common = IMPLICIT_DEF
+ %2:gpr32common = IMPLICIT_DEF
+ %3:gpr32 = IMPLICIT_DEF
+ B %bb.1
+
+ bb.1:
+ successors: %bb.2(0x30000000), %bb.3(0x50000000); %bb.2(37.50%), %bb.3(62.50%)
+ %4:gpr64common = PHI undef %0:gpr64all, %bb.0, %5:gpr64common, %bb.6
+ %6:gpr64 = LDRXui undef %1:gpr64common, 0 :: (load (s64))
+ STRXui killed %4:gpr64common, undef %1:gpr64common, 0 :: (store (s64))
+ CBNZX undef %6:gpr64, %bb.3
+
+ bb.2:
+ successors: %bb.5(0x80000000); %bb.5(100.00%)
+ %7:gpr64all = COPY killed %6:gpr64
+ B %bb.5
+
+ bb.3:
+ successors: %bb.5(0x04000000), %bb.4(0x7c000000); %bb.5(3.12%), %bb.4(96.88%)
+ dead $wzr = SUBSWrr killed undef %3:gpr32, killed undef %2:gpr32common, implicit-def $nzcv
+ Bcc 12, %bb.5, implicit killed undef $nzcv
+ B %bb.4
+
+ bb.4:
+ successors: %bb.5(0x04000000), %bb.3(0x7c000000); %bb.5(3.12%), %bb.3(96.88%)
+ dead $xzr = SUBSXrr killed undef %6:gpr64, killed undef %6:gpr64, implicit-def $nzcv
+ Bcc 1, %bb.3, implicit killed undef $nzcv
+
+ bb.5:
+ successors: %bb.7(0x04000000), %bb.6(0x7c000000); %bb.7(3.12%), %bb.6(96.88%)
+ %5:gpr64common = PHI %7:gpr64all, %bb.2, undef %0:gpr64all, %bb.3, undef %0:gpr64all, %bb.4
+ CBZX undef %5:gpr64common, %bb.7
+ B %bb.6
+
+ bb.6:
+ successors: %bb.7(0x04000000), %bb.1(0x7c000000); %bb.7(3.12%), %bb.1(96.88%)
+ dead $wzr = SUBSWrr killed undef %3:gpr32, killed undef %2:gpr32common, implicit-def $nzcv
+ Bcc 12, %bb.7, implicit killed undef $nzcv
+ B %bb.1
+
+ bb.7:
+ RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
index 7542e9c4b8f5b..327d0749c7dbf 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -35,10 +35,10 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK-LABEL: check_deinterleaving_has_deinterleave:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: add x8, x0, #16
; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: movi v2.2d, #0000000000000000
+; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov w9, #32 // =0x20
; CHECK-NEXT: movi v4.2d, #0000000000000000
; CHECK-NEXT: movi v5.2d, #0000000000000000
@@ -64,16 +64,16 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK-NEXT: ushll v24.4s, v18.4h, #0
; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0
; CHECK-NEXT: ushll v20.4s, v20.4h, #0
-; CHECK-NEXT: and v21.16b, v21.16b, v1.16b
-; CHECK-NEXT: and v19.16b, v19.16b, v1.16b
-; CHECK-NEXT: and v22.16b, v22.16b, v1.16b
-; CHECK-NEXT: and v17.16b, v17.16b, v1.16b
-; CHECK-NEXT: and v23.16b, v23.16b, v1.16b
-; CHECK-NEXT: and v24.16b, v24.16b, v1.16b
-; CHECK-NEXT: and v18.16b, v18.16b, v1.16b
-; CHECK-NEXT: and v20.16b, v20.16b, v1.16b
+; CHECK-NEXT: and v21.16b, v21.16b, v2.16b
+; CHECK-NEXT: and v19.16b, v19.16b, v2.16b
+; CHECK-NEXT: and v22.16b, v22.16b, v2.16b
+; CHECK-NEXT: and v17.16b, v17.16b, v2.16b
+; CHECK-NEXT: and v23.16b, v23.16b, v2.16b
+; CHECK-NEXT: and v24.16b, v24.16b, v2.16b
+; CHECK-NEXT: and v18.16b, v18.16b, v2.16b
+; CHECK-NEXT: and v20.16b, v20.16b, v2.16b
; CHECK-NEXT: add v4.4s, v4.4s, v19.4s
-; CHECK-NEXT: add v2.4s, v2.4s, v21.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v21.4s
; CHECK-NEXT: add v0.4s, v0.4s, v22.4s
; CHECK-NEXT: add v3.4s, v3.4s, v17.4s
; CHECK-NEXT: add v16.4s, v16.4s, v23.4s
@@ -82,12 +82,12 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK-NEXT: add v7.4s, v7.4s, v18.4s
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %middle.block
-; CHECK-NEXT: add v1.4s, v7.4s, v3.4s
+; CHECK-NEXT: add v2.4s, v7.4s, v3.4s
; CHECK-NEXT: add v3.4s, v16.4s, v4.4s
; CHECK-NEXT: add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT: add v2.4s, v6.4s, v2.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v1.4s, v2.4s, v3.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 880bd2904154c..69df51a27d10c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -18,8 +18,8 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: mov w8, #100 // =0x64
; CHECK-NEXT: cntd x9
; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: rdvl x10, #2
-; CHECK-NEXT: mov x11, x9
+; CHECK-NEXT: mov x10, x9
+; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
@@ -33,16 +33,16 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT: add x1, x1, x10
-; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: add x1, x1, x11
+; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z0.d, p2/m, z7.d
; CHECK-NEXT: mov z1.d, p1/m, z6.d
-; CHECK-NEXT: whilelo p1.d, x11, x8
-; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: whilelo p1.d, x10, x8
+; CHECK-NEXT: add x10, x10, x9
; CHECK-NEXT: b.mi .LBB0_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -217,10 +217,10 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
; CHECK-NEXT: mov w8, #100 // =0x64
; CHECK-NEXT: cntd x9
; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: rdvl x10, #2
-; CHECK-NEXT: cnth x11
+; CHECK-NEXT: mov x10, x9
+; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x12, x9
+; CHECK-NEXT: cnth x12
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB2_1: // %vector.body
@@ -228,7 +228,7 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
; CHECK-NEXT: mov z6.d, z1.d
; CHECK-NEXT: mov z7.d, z0.d
-; CHECK-NEXT: add x2, x2, x11
+; CHECK-NEXT: add x2, x2, x12
; CHECK-NEXT: and z2.d, z2.d, #0xffffffff
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
@@ -237,16 +237,16 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT: add x1, x1, x10
-; CHECK-NEXT: add x0, x0, x10
+; CHECK-NEXT: add x1, x1, x11
+; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z0.d, p2/m, z7.d
; CHECK-NEXT: mov z1.d, p1/m, z6.d
-; CHECK-NEXT: whilelo p1.d, x12, x8
-; CHECK-NEXT: add x12, x12, x9
+; CHECK-NEXT: whilelo p1.d, x10, x8
+; CHECK-NEXT: add x10, x10, x9
; CHECK-NEXT: b.mi .LBB2_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
index aed3072bb4af3..c977869d2ce95 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
@@ -25,14 +25,14 @@ define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q3, q2, [x9]
; CHECK-NEXT: cmp x8, #1600
; CHECK-NEXT: ldp q5, q4, [x10]
-; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0
-; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0
-; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90
-; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #90
+; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0
+; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0
+; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90
+; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #90
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %middle.block
-; CHECK-NEXT: zip2 v2.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip2 v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: faddp d1, v2.2d
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/phi.ll b/llvm/test/CodeGen/AArch64/phi.ll
index 02842c04bf7bf..446c3beeff976 100644
--- a/llvm/test/CodeGen/AArch64/phi.ll
+++ b/llvm/test/CodeGen/AArch64/phi.ll
@@ -131,8 +131,8 @@ define i128 @ti128(i1 %c, ptr %p, i128 %a, i128 %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB4_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov x4, x2
; CHECK-SD-NEXT: mov x5, x3
+; CHECK-SD-NEXT: mov x4, x2
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB4_2: // %e
; CHECK-SD-NEXT: mov x0, x4
@@ -336,9 +336,9 @@ define <3 x i8> @tv3i8(i1 %c, ptr %p, <3 x i8> %a, <3 x i8> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB11_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov w5, w2
-; CHECK-SD-NEXT: mov w6, w3
; CHECK-SD-NEXT: mov w7, w4
+; CHECK-SD-NEXT: mov w6, w3
+; CHECK-SD-NEXT: mov w5, w2
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB11_2: // %e
; CHECK-SD-NEXT: mov w0, w5
@@ -454,8 +454,8 @@ define <32 x i8> @tv32i8(i1 %c, ptr %p, <32 x i8> %a, <32 x i8> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB15_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB15_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -584,8 +584,8 @@ define <16 x i16> @tv16i16(i1 %c, ptr %p, <16 x i16> %a, <16 x i16> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB20_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB20_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -679,8 +679,8 @@ define <8 x i32> @tv8i32(i1 %c, ptr %p, <8 x i32> %a, <8 x i32> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB24_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB24_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -734,10 +734,10 @@ define <3 x i64> @tv3i64(i1 %c, ptr %p, <3 x i64> %a, <3 x i64> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB26_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: fmov d3, d0
+; CHECK-SD-NEXT: fmov d5, d2
; CHECK-SD-NEXT: fmov d4, d1
; CHECK-SD-NEXT: str wzr, [x1]
-; CHECK-SD-NEXT: fmov d5, d2
+; CHECK-SD-NEXT: fmov d3, d0
; CHECK-SD-NEXT: .LBB26_2: // %e
; CHECK-SD-NEXT: fmov d0, d3
; CHECK-SD-NEXT: fmov d1, d4
@@ -783,8 +783,8 @@ define <4 x i64> @tv4i64(i1 %c, ptr %p, <4 x i64> %a, <4 x i64> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB27_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB27_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -818,8 +818,8 @@ define <2 x i128> @tv2i128(i1 %c, ptr %p, <2 x i128> %a, <2 x i128> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB28_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov x6, x2
; CHECK-SD-NEXT: mov x7, x3
+; CHECK-SD-NEXT: mov x6, x2
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: b .LBB28_3
; CHECK-SD-NEXT: .LBB28_2:
@@ -883,10 +883,10 @@ define <3 x ptr> @tv3p0(i1 %c, ptr %p, <3 x ptr> %a, <3 x ptr> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB30_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: fmov d3, d0
+; CHECK-SD-NEXT: fmov d5, d2
; CHECK-SD-NEXT: fmov d4, d1
; CHECK-SD-NEXT: str wzr, [x1]
-; CHECK-SD-NEXT: fmov d5, d2
+; CHECK-SD-NEXT: fmov d3, d0
; CHECK-SD-NEXT: .LBB30_2: // %e
; CHECK-SD-NEXT: fmov d0, d3
; CHECK-SD-NEXT: fmov d1, d4
@@ -932,8 +932,8 @@ define <4 x ptr> @tv4p0(i1 %c, ptr %p, <4 x ptr> %a, <4 x ptr> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB31_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB31_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -1047,8 +1047,8 @@ define <16 x half> @tv16f16(i1 %c, ptr %p, <16 x half> %a, <16 x half> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB36_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB36_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -1142,8 +1142,8 @@ define <8 x float> @tv8f32(i1 %c, ptr %p, <8 x float> %a, <8 x float> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB40_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB40_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -1197,10 +1197,10 @@ define <3 x double> @tv3f64(i1 %c, ptr %p, <3 x double> %a, <3 x double> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB42_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: fmov d3, d0
+; CHECK-SD-NEXT: fmov d5, d2
; CHECK-SD-NEXT: fmov d4, d1
; CHECK-SD-NEXT: str wzr, [x1]
-; CHECK-SD-NEXT: fmov d5, d2
+; CHECK-SD-NEXT: fmov d3, d0
; CHECK-SD-NEXT: .LBB42_2: // %e
; CHECK-SD-NEXT: fmov d0, d3
; CHECK-SD-NEXT: fmov d1, d4
@@ -1246,8 +1246,8 @@ define <4 x double> @tv4f64(i1 %c, ptr %p, <4 x double> %a, <4 x double> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB43_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB43_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -1281,8 +1281,8 @@ define <2 x fp128> @tv2f128(i1 %c, ptr %p, <2 x fp128> %a, <2 x fp128> %b) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: tbz w0, #0, .LBB44_2
; CHECK-SD-NEXT: // %bb.1: // %t
-; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: mov v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v2.16b, v0.16b
; CHECK-SD-NEXT: str wzr, [x1]
; CHECK-SD-NEXT: .LBB44_2: // %e
; CHECK-SD-NEXT: mov v0.16b, v2.16b
@@ -1296,8 +1296,8 @@ define <2 x fp128> @tv2f128(i1 %c, ptr %p, <2 x fp128> %a, <2 x fp128> %b) {
; CHECK-GI-NEXT: mov d4, v1.d[1]
; CHECK-GI-NEXT: mov d5, v0.d[1]
; CHECK-GI-NEXT: str wzr, [x1]
-; CHECK-GI-NEXT: fmov d2, d0
; CHECK-GI-NEXT: fmov d3, d1
+; CHECK-GI-NEXT: fmov d2, d0
; CHECK-GI-NEXT: b .LBB44_3
; CHECK-GI-NEXT: .LBB44_2:
; CHECK-GI-NEXT: mov d4, v3.d[1]
diff --git a/llvm/test/CodeGen/AArch64/pr48188.ll b/llvm/test/CodeGen/AArch64/pr48188.ll
index d01069696572e..634517b099f98 100644
--- a/llvm/test/CodeGen/AArch64/pr48188.ll
+++ b/llvm/test/CodeGen/AArch64/pr48188.ll
@@ -14,17 +14,17 @@ define void @test() nounwind {
; GISEL-NEXT: b .LBB0_1
; GISEL-NEXT: .LBB0_1: // %loop
; GISEL-NEXT: // =>This Inner Loop Header: Depth=1
-; GISEL-NEXT: ldr x8, [sp, #8] // 8-byte Folded Reload
-; GISEL-NEXT: ldr x9, [sp] // 8-byte Folded Reload
-; GISEL-NEXT: str x9, [sp] // 8-byte Folded Spill
-; GISEL-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
+; GISEL-NEXT: ldr x8, [sp] // 8-byte Folded Reload
+; GISEL-NEXT: ldr x9, [sp, #8] // 8-byte Folded Reload
+; GISEL-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; GISEL-NEXT: str x8, [sp] // 8-byte Folded Spill
; GISEL-NEXT: b .LBB0_1
;
; SDAG-LABEL: test:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: sub sp, sp, #16
-; SDAG-NEXT: mov x1, xzr
-; SDAG-NEXT: mov x0, x1
+; SDAG-NEXT: mov x0, xzr
+; SDAG-NEXT: mov x1, x0
; SDAG-NEXT: str x1, [sp] // 8-byte Folded Spill
; SDAG-NEXT: str x0, [sp, #8] // 8-byte Folded Spill
; SDAG-NEXT: b .LBB0_1
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
index 5b501762418ef..921cadc7a7511 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -211,27 +211,27 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: cmp w12, #2
; CHECK-NEXT: b.ne LBB0_43
; CHECK-NEXT: ; %bb.35: ; %while.cond130.preheader
-; CHECK-NEXT: ldrb w8, [x9, x11]
-; CHECK-NEXT: cbz w8, LBB0_23
+; CHECK-NEXT: ldrb w12, [x9, x11]
+; CHECK-NEXT: cbz w12, LBB0_23
; CHECK-NEXT: ; %bb.36: ; %land.rhs134.preheader
-; CHECK-NEXT: mov x12, xzr
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: b LBB0_38
; CHECK-NEXT: LBB0_37: ; %if.then152
; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT: add x8, x9, x12
-; CHECK-NEXT: add x12, x12, #1
-; CHECK-NEXT: add x8, x8, x11
-; CHECK-NEXT: ldrb w8, [x8, #1]
-; CHECK-NEXT: cbz w8, LBB0_43
+; CHECK-NEXT: add x12, x9, x8
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: add x12, x12, x11
+; CHECK-NEXT: ldrb w12, [x12, #1]
+; CHECK-NEXT: cbz w12, LBB0_43
; CHECK-NEXT: LBB0_38: ; %land.rhs134
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x13, x10, x12
+; CHECK-NEXT: add x13, x10, x8
; CHECK-NEXT: ldrb w13, [x13, x11]
; CHECK-NEXT: cbz w13, LBB0_23
; CHECK-NEXT: ; %bb.39: ; %while.body139
; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT: cmp w8, w13
+; CHECK-NEXT: cmp w12, w13
; CHECK-NEXT: b.eq LBB0_37
; CHECK-NEXT: ; %bb.40: ; %while.body139
; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1
@@ -239,7 +239,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
; CHECK-NEXT: b.eq LBB0_37
; CHECK-NEXT: ; %bb.41: ; %while.body139
; CHECK-NEXT: ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT: cmp w8, #94
+; CHECK-NEXT: cmp w12, #94
; CHECK-NEXT: b.eq LBB0_37
; CHECK-NEXT: LBB0_42:
; CHECK-NEXT: mov w0, wzr
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
index c91de8f3a0a47..8f1c504a7f684 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -31,14 +31,12 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: .cfi_offset b14, -104
; CHECK-NEXT: .cfi_offset b15, -112
; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: // implicit-def: $q1
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: adrp x9, B+48
; CHECK-NEXT: add x9, x9, :lo12:B+48
; CHECK-NEXT: adrp x10, A
; CHECK-NEXT: add x10, x10, :lo12:A
; CHECK-NEXT: mov x11, xzr
-; CHECK-NEXT: // kill: killed $q1
; CHECK-NEXT: // implicit-def: $q1
; CHECK-NEXT: mov x12, xzr
; CHECK-NEXT: // implicit-def: $q0
@@ -56,7 +54,7 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: // implicit-def: $q22
; CHECK-NEXT: // implicit-def: $q23
; CHECK-NEXT: // implicit-def: $q24
-; CHECK-NEXT: // implicit-def: $q9
+; CHECK-NEXT: // implicit-def: $q25
; CHECK-NEXT: // implicit-def: $q27
; CHECK-NEXT: // implicit-def: $q12
; CHECK-NEXT: // implicit-def: $q28
@@ -66,95 +64,97 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: // implicit-def: $q30
; CHECK-NEXT: // implicit-def: $q11
; CHECK-NEXT: // implicit-def: $q31
-; CHECK-NEXT: // implicit-def: $q13
; CHECK-NEXT: // kill: killed $q1
+; CHECK-NEXT: // implicit-def: $q9
+; CHECK-NEXT: // implicit-def: $q13
; CHECK-NEXT: // implicit-def: $q1
; CHECK-NEXT: // kill: killed $q1
; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill
-; CHECK-NEXT: ldr q15, [x8]
; CHECK-NEXT: ldr x15, [x8]
-; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: add x20, x10, x11
-; CHECK-NEXT: mov v8.16b, v28.16b
-; CHECK-NEXT: fmov x2, d15
-; CHECK-NEXT: mov x17, v15.d[1]
+; CHECK-NEXT: ldr x6, [x8]
+; CHECK-NEXT: ldr x20, [x20, #128]
+; CHECK-NEXT: stp q25, q29, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: mov v29.16b, v21.16b
+; CHECK-NEXT: stp q15, q14, [sp, #32] // 32-byte Folded Spill
; CHECK-NEXT: ldr q14, [x8]
+; CHECK-NEXT: mov v21.16b, v0.16b
+; CHECK-NEXT: ldr q15, [x8]
+; CHECK-NEXT: mov v8.16b, v28.16b
; CHECK-NEXT: mov v28.16b, v24.16b
-; CHECK-NEXT: mov v24.16b, v20.16b
-; CHECK-NEXT: mov v20.16b, v17.16b
; CHECK-NEXT: fmov x13, d14
; CHECK-NEXT: mov x16, v14.d[1]
-; CHECK-NEXT: mov v17.16b, v5.16b
-; CHECK-NEXT: mul x3, x2, x15
+; CHECK-NEXT: mov v24.16b, v20.16b
+; CHECK-NEXT: fmov x2, d15
+; CHECK-NEXT: mov x17, v15.d[1]
+; CHECK-NEXT: mov v20.16b, v17.16b
; CHECK-NEXT: ldr q14, [x9], #64
+; CHECK-NEXT: mov v17.16b, v5.16b
+; CHECK-NEXT: mul x18, x13, x15
; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x6, [x8]
-; CHECK-NEXT: ldr x20, [x20, #128]
-; CHECK-NEXT: mul x1, x17, x15
+; CHECK-NEXT: mov v25.16b, v6.16b
; CHECK-NEXT: mov x14, v14.d[1]
; CHECK-NEXT: fmov x5, d14
-; CHECK-NEXT: mov v29.16b, v21.16b
-; CHECK-NEXT: mov v21.16b, v0.16b
-; CHECK-NEXT: mov v25.16b, v6.16b
-; CHECK-NEXT: mul x18, x13, x15
; CHECK-NEXT: mov v6.16b, v2.16b
+; CHECK-NEXT: mul x3, x2, x15
; CHECK-NEXT: mov v26.16b, v22.16b
-; CHECK-NEXT: fmov d15, x3
; CHECK-NEXT: mov v22.16b, v18.16b
; CHECK-NEXT: mov v18.16b, v7.16b
-; CHECK-NEXT: mul x0, x16, x15
; CHECK-NEXT: mov v7.16b, v3.16b
; CHECK-NEXT: mov v16.16b, v4.16b
+; CHECK-NEXT: mul x0, x16, x15
; CHECK-NEXT: add x11, x11, #8
; CHECK-NEXT: add x12, x12, #1
-; CHECK-NEXT: mov v15.d[1], x1
-; CHECK-NEXT: mul x4, x14, x15
-; CHECK-NEXT: cmp x11, #64
; CHECK-NEXT: fmov d14, x18
+; CHECK-NEXT: cmp x11, #64
+; CHECK-NEXT: mul x1, x17, x15
+; CHECK-NEXT: fmov d15, x3
+; CHECK-NEXT: mul x4, x14, x15
+; CHECK-NEXT: mov v14.d[1], x0
; CHECK-NEXT: mul x15, x5, x15
-; CHECK-NEXT: add v5.2d, v5.2d, v15.2d
+; CHECK-NEXT: mov v15.d[1], x1
; CHECK-NEXT: mul x21, x2, x6
-; CHECK-NEXT: mov v14.d[1], x0
+; CHECK-NEXT: add v5.2d, v5.2d, v14.2d
+; CHECK-NEXT: add v9.2d, v9.2d, v14.2d
; CHECK-NEXT: mul x2, x2, x20
; CHECK-NEXT: fmov d0, x15
-; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: add v13.2d, v13.2d, v15.2d
; CHECK-NEXT: mul x22, x13, x20
-; CHECK-NEXT: add v5.2d, v5.2d, v14.2d
+; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: fmov d3, x21
+; CHECK-NEXT: ldp q15, q14, [sp, #32] // 32-byte Folded Reload
; CHECK-NEXT: mul x19, x17, x6
; CHECK-NEXT: mov v0.d[1], x4
+; CHECK-NEXT: mov v5.16b, v13.16b
; CHECK-NEXT: fmov d1, x2
+; CHECK-NEXT: mov v13.16b, v9.16b
+; CHECK-NEXT: ldr q9, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: mul x17, x17, x20
-; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: add v5.2d, v13.2d, v14.2d
; CHECK-NEXT: fmov d2, x22
-; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: mul x7, x16, x6
-; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload
+; CHECK-NEXT: add v9.2d, v9.2d, v0.2d
; CHECK-NEXT: mov v3.d[1], x19
-; CHECK-NEXT: add v13.2d, v13.2d, v0.2d
; CHECK-NEXT: mul x16, x16, x20
; CHECK-NEXT: mov v1.d[1], x17
+; CHECK-NEXT: str q9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov v9.16b, v13.16b
; CHECK-NEXT: mul x23, x5, x20
-; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: mov v13.16b, v5.16b
; CHECK-NEXT: mov v5.16b, v17.16b
; CHECK-NEXT: mov v17.16b, v20.16b
; CHECK-NEXT: mov v20.16b, v24.16b
-; CHECK-NEXT: mul x13, x13, x6
; CHECK-NEXT: mov v24.16b, v28.16b
+; CHECK-NEXT: mul x13, x13, x6
; CHECK-NEXT: add v11.2d, v11.2d, v3.2d
+; CHECK-NEXT: add v27.2d, v27.2d, v3.2d
; CHECK-NEXT: mov v2.d[1], x16
; CHECK-NEXT: add v15.2d, v15.2d, v1.2d
-; CHECK-NEXT: add v27.2d, v27.2d, v3.2d
-; CHECK-NEXT: mul x18, x14, x20
; CHECK-NEXT: add v23.2d, v23.2d, v3.2d
+; CHECK-NEXT: mul x18, x14, x20
; CHECK-NEXT: add v19.2d, v19.2d, v3.2d
-; CHECK-NEXT: fmov d4, x23
; CHECK-NEXT: add v10.2d, v10.2d, v3.2d
+; CHECK-NEXT: fmov d4, x23
; CHECK-NEXT: mul x15, x5, x6
; CHECK-NEXT: fmov d0, x13
; CHECK-NEXT: add v14.2d, v14.2d, v2.2d
@@ -164,6 +164,7 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: mov v7.16b, v18.16b
; CHECK-NEXT: mov v4.d[1], x18
; CHECK-NEXT: mov v18.16b, v22.16b
+; CHECK-NEXT: mov v6.16b, v25.16b
; CHECK-NEXT: mov v0.d[1], x7
; CHECK-NEXT: fmov d1, x15
; CHECK-NEXT: add v28.2d, v8.2d, v4.2d
@@ -181,38 +182,36 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-NEXT: add v3.2d, v3.2d, v0.2d
; CHECK-NEXT: mov v0.16b, v21.16b
; CHECK-NEXT: mov v21.16b, v29.16b
-; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: add v9.2d, v9.2d, v1.2d
-; CHECK-NEXT: add v6.2d, v25.2d, v1.2d
+; CHECK-NEXT: add v6.2d, v6.2d, v1.2d
+; CHECK-NEXT: ldp q25, q29, [sp] // 32-byte Folded Reload
; CHECK-NEXT: add v5.2d, v5.2d, v1.2d
-; CHECK-NEXT: add v29.2d, v29.2d, v1.2d
; CHECK-NEXT: add v21.2d, v21.2d, v1.2d
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: add v29.2d, v29.2d, v1.2d
+; CHECK-NEXT: add v25.2d, v25.2d, v1.2d
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: adrp x8, C
; CHECK-NEXT: add x8, x8, :lo12:C
-; CHECK-NEXT: stp q11, q30, [x8, #80]
+; CHECK-NEXT: stp q31, q11, [x8, #64]
; CHECK-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT: str q1, [x8]
-; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: stp q1, q13, [x8]
+; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldr x23, [sp, #160] // 8-byte Folded Reload
-; CHECK-NEXT: stp q15, q14, [x8, #144]
+; CHECK-NEXT: stp q30, q29, [x8, #96]
; CHECK-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT: stp q1, q13, [x8, #16]
-; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: stp q9, q1, [x8, #32]
+; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload
+; CHECK-NEXT: stp q15, q14, [x8, #144]
+; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: stp q28, q12, [x8, #176]
; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT: stp q1, q31, [x8, #48]
-; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT: stp q9, q24, [x8, #240]
-; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload
; CHECK-NEXT: stp q19, q18, [x8, #336]
; CHECK-NEXT: stp q10, q7, [x8, #400]
; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload
-; CHECK-NEXT: str q29, [x8, #112]
; CHECK-NEXT: str q27, [x8, #208]
+; CHECK-NEXT: stp q25, q24, [x8, #240]
; CHECK-NEXT: stp q23, q22, [x8, #272]
; CHECK-NEXT: stp q21, q20, [x8, #304]
; CHECK-NEXT: stp q6, q17, [x8, #368]
diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
index f5df5ea53c990..b3dfab8f69b59 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -95,18 +95,18 @@ define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: neg x10, x8
-; CHECK-NEXT: add x10, x10, #4
+; CHECK-NEXT: mov x10, xzr
+; CHECK-NEXT: neg x9, x8
+; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: b.ne .LBB2_3
; CHECK-NEXT: // %bb.2: // %vector.body
; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
-; CHECK-NEXT: cmp x10, x9
-; CHECK-NEXT: add x9, x9, x8
+; CHECK-NEXT: cmp x9, x10
+; CHECK-NEXT: add x10, x10, x8
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: .LBB2_3: // %middle.split
; CHECK-NEXT: ptest p0, p1.b
@@ -138,18 +138,18 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: neg x10, x8
-; CHECK-NEXT: add x10, x10, #4
+; CHECK-NEXT: mov x10, xzr
+; CHECK-NEXT: neg x9, x8
+; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: .LBB3_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: b.ne .LBB3_3
; CHECK-NEXT: // %bb.2: // %vector.body
; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT: cmp x10, x9
-; CHECK-NEXT: add x9, x9, x8
+; CHECK-NEXT: cmp x9, x10
+; CHECK-NEXT: add x10, x10, x8
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: .LBB3_3: // %middle.split
; CHECK-NEXT: ptest p0, p1.b
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 4d383fefc43c7..163124c0d2757 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -151,7 +151,7 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-NEXT: .LBB4_3: // %LI
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB4_6 Depth 2
-; CHECK-NEXT: mov x21, xzr
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add x23, x22, #1
; CHECK-NEXT: b .LBB4_6
; CHECK-NEXT: .LBB4_4: // %if.else
@@ -162,13 +162,13 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-NEXT: add x8, x21, #1
; CHECK-NEXT: str w0, [x20, x21, lsl #2]
; CHECK-NEXT: sub x9, x8, #1
-; CHECK-NEXT: mov x21, x8
; CHECK-NEXT: cmp x9, x19
; CHECK-NEXT: b.ge .LBB4_2
; CHECK-NEXT: .LBB4_6: // %LJ
; CHECK-NEXT: // Parent Loop BB4_3 Depth=1
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr w8, [x20, x21, lsl #2]
+; CHECK-NEXT: mov x21, x8
+; CHECK-NEXT: ldr w8, [x20, x8, lsl #2]
; CHECK-NEXT: tbz w8, #31, .LBB4_4
; CHECK-NEXT: // %bb.7: // %if.then
; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2
diff --git a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
index d94fa6433bb7f..2fe3001ec0f44 100644
--- a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
@@ -14,22 +14,22 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
; CHECK-NEXT: // %bb.2: // %for.body.us.preheader
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: add x11, x2, x11, lsl #1
-; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov w9, wzr
; CHECK-NEXT: mov w10, wzr
; CHECK-NEXT: mov x12, #4 // =0x4
; CHECK-NEXT: mov x13, #8 // =0x8
; CHECK-NEXT: .LBB0_3: // %for.body.us
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB0_4 Depth 2
-; CHECK-NEXT: add x14, x0, x9, lsl #2
-; CHECK-NEXT: sbfiz x15, x8, #1, #32
+; CHECK-NEXT: add x14, x0, x8, lsl #2
+; CHECK-NEXT: sbfiz x15, x9, #1, #32
; CHECK-NEXT: mov x16, x2
; CHECK-NEXT: ldp s0, s1, [x14]
; CHECK-NEXT: add x15, x15, #8
; CHECK-NEXT: ldp s2, s3, [x14, #8]
-; CHECK-NEXT: ubfiz x14, x8, #1, #32
+; CHECK-NEXT: ubfiz x14, x9, #1, #32
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: fcvt h1, s1
; CHECK-NEXT: fcvt h2, s2
@@ -91,8 +91,8 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
; CHECK-NEXT: // %bb.5: // %while.cond.i..exit_crit_edge.us
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: add w10, w10, #1
-; CHECK-NEXT: add x9, x9, #4
-; CHECK-NEXT: add w8, w8, #16
+; CHECK-NEXT: add x8, x8, #4
+; CHECK-NEXT: add w9, w9, #16
; CHECK-NEXT: cmp w10, w1
; CHECK-NEXT: b.ne .LBB0_3
; CHECK-NEXT: .LBB0_6: // %exit78
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 124f81e7864d1..39fe92aae0619 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
; CHECK-NEXT: whilelt p0.s, wzr, w0
; CHECK-NEXT: b.pl .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: cntw x9
+; CHECK-NEXT: mov w9, wzr
+; CHECK-NEXT: cntw x8
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: whilelt p0.s, w8, w0
-; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: whilelt p0.s, w9, w0
+; CHECK-NEXT: add w9, w9, w8
; CHECK-NEXT: b.mi .LBB0_2
; CHECK-NEXT: .LBB0_3: // %exit
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll
index 07ee87e880aff..9949a48034815 100644
--- a/llvm/test/CodeGen/AArch64/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/swifterror.ll
@@ -564,10 +564,11 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
; CHECK-O0-AARCH64-NEXT: mov w8, #16 ; =0x10
; CHECK-O0-AARCH64-NEXT: mov w0, w8
; CHECK-O0-AARCH64-NEXT: bl _malloc
-; CHECK-O0-AARCH64-NEXT: mov x9, x0
+; CHECK-O0-AARCH64-NEXT: mov x1, x0
+; CHECK-O0-AARCH64-NEXT: mov x0, x1
+; CHECK-O0-AARCH64-NEXT: str x1, [sp, #8] ; 8-byte Folded Spill
; CHECK-O0-AARCH64-NEXT: mov w8, #1 ; =0x1
-; CHECK-O0-AARCH64-NEXT: strb w8, [x9, #8]
-; CHECK-O0-AARCH64-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-O0-AARCH64-NEXT: strb w8, [x0, #8]
; CHECK-O0-AARCH64-NEXT: LBB4_3: ; %bb_cont
; CHECK-O0-AARCH64-NEXT: ; in Loop: Header=BB4_1 Depth=1
; CHECK-O0-AARCH64-NEXT: ldr s0, [sp, #16] ; 4-byte Folded Reload
@@ -605,12 +606,11 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
; CHECK-O0-ARM64_32-NEXT: mov w8, #16 ; =0x10
; CHECK-O0-ARM64_32-NEXT: mov w0, w8
; CHECK-O0-ARM64_32-NEXT: bl _malloc
-; CHECK-O0-ARM64_32-NEXT: mov x9, x0
-; CHECK-O0-ARM64_32-NEXT: ; kill: def $x0 killed $x9
-; CHECK-O0-ARM64_32-NEXT: mov x0, x9
+; CHECK-O0-ARM64_32-NEXT: ; kill: def $x1 killed $x0
+; CHECK-O0-ARM64_32-NEXT: mov x1, x0
+; CHECK-O0-ARM64_32-NEXT: str x1, [sp, #8] ; 8-byte Folded Spill
; CHECK-O0-ARM64_32-NEXT: mov w8, #1 ; =0x1
-; CHECK-O0-ARM64_32-NEXT: strb w8, [x9, #8]
-; CHECK-O0-ARM64_32-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill
+; CHECK-O0-ARM64_32-NEXT: strb w8, [x0, #8]
; CHECK-O0-ARM64_32-NEXT: LBB4_3: ; %bb_cont
; CHECK-O0-ARM64_32-NEXT: ; in Loop: Header=BB4_1 Depth=1
; CHECK-O0-ARM64_32-NEXT: ldr s0, [sp, #16] ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index b5d64112db727..e73cbf1ee5df0 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -17,18 +17,18 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: .LBB0_3: // %vector.ph
; CHECK-NEXT: add x11, x8, #1
; CHECK-NEXT: mov w8, #1132396544 // =0x437f0000
-; CHECK-NEXT: add x12, x0, #4
+; CHECK-NEXT: add x13, x0, #4
; CHECK-NEXT: and x10, x11, #0x1fffffff8
; CHECK-NEXT: dup v0.4s, w8
-; CHECK-NEXT: add x13, x1, #16
+; CHECK-NEXT: add x14, x1, #16
; CHECK-NEXT: add x8, x1, x10, lsl #2
+; CHECK-NEXT: mov x12, x10
; CHECK-NEXT: add x9, x0, x10
-; CHECK-NEXT: mov x14, x10
; CHECK-NEXT: .LBB0_4: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp q1, q2, [x13, #-16]
-; CHECK-NEXT: subs x14, x14, #8
-; CHECK-NEXT: add x13, x13, #32
+; CHECK-NEXT: ldp q1, q2, [x14, #-16]
+; CHECK-NEXT: subs x12, x12, #8
+; CHECK-NEXT: add x14, x14, #32
; CHECK-NEXT: fcmgt v3.4s, v1.4s, v0.4s
; CHECK-NEXT: fcmgt v4.4s, v2.4s, v0.4s
; CHECK-NEXT: fcmlt v5.4s, v1.4s, #0.0
@@ -44,8 +44,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: uzp1 v1.8b, v1.8b, v0.8b
; CHECK-NEXT: uzp1 v2.8b, v2.8b, v0.8b
; CHECK-NEXT: mov v1.s[1], v2.s[0]
-; CHECK-NEXT: stur d1, [x12, #-4]
-; CHECK-NEXT: add x12, x12, #8
+; CHECK-NEXT: stur d1, [x13, #-4]
+; CHECK-NEXT: add x13, x13, #8
; CHECK-NEXT: b.ne .LBB0_4
; CHECK-NEXT: // %bb.5: // %middle.block
; CHECK-NEXT: cmp x11, x10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 666523c88860c..11afcebfada2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -1670,23 +1670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s16
-; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX942-NEXT: v_max_f32_e32 v0, v0, v0
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX942-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_max_f32_e32 v2, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v4, v2, v0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1718,22 +1718,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s20
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v0
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1743,23 +1743,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v0
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v3
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f32_e32 v1, v5, v5
+; GFX908-NEXT: v_max_f32_e32 v4, v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1769,23 +1769,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX8-NEXT: v_max_f32_e32 v0, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1986,23 +1986,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, s16
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[9:10], v[9:10]
+; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[2:3], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2026,24 +2027,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, s16
-; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX11-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2077,24 +2079,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
-; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v10, v3
; GFX908-NEXT: v_mov_b32_e32 v9, v2
-; GFX908-NEXT: v_mov_b32_e32 v8, v1
-; GFX908-NEXT: v_mov_b32_e32 v7, v0
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX908-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v9
+; GFX908-NEXT: v_mov_b32_e32 v5, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2106,24 +2108,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
-; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v10, v3
; GFX8-NEXT: v_mov_b32_e32 v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v8, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, v0
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX8-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
+; GFX8-NEXT: v_mov_b32_e32 v4, v9
+; GFX8-NEXT: v_mov_b32_e32 v5, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 351502816ae6e..404da6a8a1ef7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -1670,23 +1670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s16
-; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s16
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX942-NEXT: v_max_f32_e32 v0, v0, v0
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX942-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_max_f32_e32 v2, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v4, v2, v0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1718,22 +1718,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s20
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s20
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v0
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
+; GFX90A-NEXT: v_min_f32_e32 v4, v2, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1743,23 +1743,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v0
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v3
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f32_e32 v1, v5, v5
+; GFX908-NEXT: v_min_f32_e32 v4, v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1769,23 +1769,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX8-NEXT: v_min_f32_e32 v0, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1986,23 +1986,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, s16
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[9:10], v[9:10]
+; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[2:3], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2026,24 +2027,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, s16
-; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX11-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2077,24 +2079,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, s20
; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
-; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v10, v3
; GFX908-NEXT: v_mov_b32_e32 v9, v2
-; GFX908-NEXT: v_mov_b32_e32 v8, v1
-; GFX908-NEXT: v_mov_b32_e32 v7, v0
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX908-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v9
+; GFX908-NEXT: v_mov_b32_e32 v5, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2106,24 +2108,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, s20
; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
-; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v10, v3
; GFX8-NEXT: v_mov_b32_e32 v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v8, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, v0
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX8-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
+; GFX8-NEXT: v_mov_b32_e32 v4, v9
+; GFX8-NEXT: v_mov_b32_e32 v5, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
index 0da25260e2317..350f9ad882ed0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -387,11 +387,11 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB5_2 Depth 2
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
-; GFX10-NEXT: s_mov_b32 s4, s8
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
; GFX10-NEXT: v_mov_b32_e32 v8, s10
; GFX10-NEXT: v_mov_b32_e32 v9, s11
-; GFX10-NEXT: s_mov_b32 s10, 0
+; GFX10-NEXT: s_mov_b32 s10, s8
; GFX10-NEXT: s_mov_b32 s11, 0
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo
@@ -402,18 +402,18 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s11
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_add_i32 s11, s11, 1
-; GFX10-NEXT: s_xor_b32 s4, s4, s12
+; GFX10-NEXT: s_xor_b32 s10, s10, s12
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0
-; GFX10-NEXT: s_or_b32 s10, vcc_lo, s10
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo
-; GFX10-NEXT: s_and_b32 s12, exec_lo, s4
+; GFX10-NEXT: s_and_b32 s12, exec_lo, s10
; GFX10-NEXT: s_or_b32 s9, s9, s12
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s10
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB5_2
; GFX10-NEXT: ; %bb.3: ; %UseInst
; GFX10-NEXT: ; in Loop: Header=BB5_1 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_mov_b32_e32 v9, s7
; GFX10-NEXT: v_mov_b32_e32 v8, s6
; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index f729de82cb042..67c4b349f9502 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -10475,7 +10475,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14
; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s37, 5
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s101, 5
-; GFX11-TRUE16-NEXT: s_mov_b32 s57, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0
; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
@@ -10653,7 +10653,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s57
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3
@@ -11321,9 +11321,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 0
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 0
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
@@ -11381,72 +11381,72 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 1
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 2
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 3
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 4
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 5
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 6
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 7
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 8
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 9
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 10
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 11
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 12
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 13
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 14
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 15
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 17
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 18
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 19
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 20
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 21
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 22
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 23
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 24
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 25
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 26
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 27
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 28
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 29
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 30
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 31
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 0
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 1
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 1
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 2
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 5
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 17
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 18
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 19
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 20
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 21
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 22
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 23
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 24
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 25
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 26
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 27
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 28
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 29
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 30
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 31
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s57, 0
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s57, 1
; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 2
; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 3
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16
@@ -11498,7 +11498,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14
; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s37, 5
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s101, 5
-; GFX11-FAKE16-NEXT: s_mov_b32 s101, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s44, 0
; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
@@ -11531,676 +11531,497 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s85, 29
; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s86, 30
; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s87, 31
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_2
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s25, 8
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s5, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 17
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s4, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 18
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s6, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 19
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 20
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 21
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 22
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 23
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 25
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 26
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 27
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 28
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 29
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 30
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 31
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[4:5], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s27, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s27, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s26, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s25, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s25, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s24, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s24, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s23, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s23, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s23, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s22, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s21, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s21, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 11
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s20, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s19, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s19, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s18, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 13
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s18, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s17, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s17, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s17, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s16, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 15
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s3, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s3, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12
; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 3
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s0, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[10:11], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 6
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[22:23], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 16
; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6
; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 9
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 26
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 27
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 28
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 29
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 30
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 31
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; GFX11-FAKE16-NEXT: s_branch .LBB13_3
-; GFX11-FAKE16-NEXT: .LBB13_2:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: s_mov_b32 s101, -1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 0
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 1
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 2
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 3
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 5
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 6
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 7
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 9
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 11
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 13
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 14
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 15
-; GFX11-FAKE16-NEXT: .LBB13_3: ; %Flow
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101
-; GFX11-FAKE16-NEXT: s_mov_b32 s101, s104
-; GFX11-FAKE16-NEXT: s_mov_b32 s104, s57
-; GFX11-FAKE16-NEXT: s_mov_b32 s57, s69
-; GFX11-FAKE16-NEXT: s_mov_b32 s69, s42
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_5
-; GFX11-FAKE16-NEXT: ; %bb.4: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s46, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s47, 3
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[6:7], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s46, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s47, 1
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[8:9], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s44
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3
+; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3
; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 24
; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16
; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16
; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3
; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 8
; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8
; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 16
; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3
; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8
; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 8
; GFX11-FAKE16-NEXT: s_add_i32 s41, s41, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24
; GFX11-FAKE16-NEXT: s_add_i32 s40, s40, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[4:5], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 24
; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3
; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16
+; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 11
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 16
+; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 13
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 24
; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3
; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 15
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s27, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s27, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s26, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s25, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s25, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s24, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s24, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s23, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s23, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s23, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s22, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s21, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s21, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s20, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s19, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s19, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s18, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s18, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s17, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s17, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s17, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s16, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s3, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s3, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 26
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s5, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 27
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s4, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s0, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 28
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[10:11], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 29
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 30
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[22:23], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 31
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14
-; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s46, 2
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s47, 3
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[6:7], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12
-; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s46, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 3
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s47, 1
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[8:9], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10
-; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 6
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6
-; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s25, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 9
-; GFX11-FAKE16-NEXT: .LBB13_5: ; %end
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 16
+; GFX11-FAKE16-NEXT: .LBB13_3: ; %end
+; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s101, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s45, s45, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s44
+; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s30, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s44, s45, s44
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s42, s74, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s43
-; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s94, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43
-; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s45, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s42
-; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 9
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s45
-; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s30, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s44, s44, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45
-; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 8
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42
-; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v18, 7
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s43
+; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s94, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s42, s42, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s44
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s100, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s98, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44
; GFX11-FAKE16-NEXT: s_and_b32 s44, s99, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s42, s42, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s98, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16
; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45
-; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s42
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s97, 8
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42
+; GFX11-FAKE16-NEXT: s_and_b32 s42, s96, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s87, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44
+; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43
; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v18, 6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s86, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 5
-; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v18, 2
; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s92, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s85, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s84, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s82, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s80, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s83, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 29
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v17, 4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v17, 3
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s69, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s81, 8
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v19, 28
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 19
; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 31
+; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s71, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s68, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s3, s19, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s20, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 30
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s70, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s78, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s86, 8
; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 21
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16
-; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v17, 2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s78, 8
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 27
-; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 24
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 22
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s67, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s66, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s64, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_and_b32 s1, s21, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v16, 30
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s54, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 25
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s65, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v16, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v16, 0
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s53, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 23
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 18
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s55, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s51, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s62, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s74, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3
; GFX11-FAKE16-NEXT: s_and_b32 s16, s23, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 20
+; GFX11-FAKE16-NEXT: s_and_b32 s17, s52, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s16, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s17, s17, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s17, s17, s18
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s16, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s17, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s97, 8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 17
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s50, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s49, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s88, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s69, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s18, s72, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v17, 1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v16, 21
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 16
; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s48, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s39, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s38, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s73, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s96, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s37, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s36, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s76, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
; GFX11-FAKE16-NEXT: s_and_b32 s16, s27, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s87, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s35, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s18, s34, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s19, vcc_hi, 8
; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v18, 7
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
; GFX11-FAKE16-NEXT: s_and_b32 s0, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s85, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s84, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s104, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s103, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s72, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s29, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s83, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s82, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s81, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s102, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
@@ -12209,147 +12030,176 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v18, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 5
; GFX11-FAKE16-NEXT: s_and_b32 s2, s40, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s61, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s80, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s18, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s19, v19, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s62, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v18, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
+; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v18, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s19, v18, 2
; GFX11-FAKE16-NEXT: s_and_b32 s16, s41, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s60, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s18, s71, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s70, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19
+; GFX11-FAKE16-NEXT: s_and_b32 s18, s18, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s19, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
+; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19
; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v18, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 0
; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s58, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s59, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s60, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 30
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s15, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s68, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s14, s67, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s66, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s14, s15
-; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 29
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s14, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s15, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s14, s15
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s12, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s65, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s12, s64, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s14, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 28
+; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v19, 27
+; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s58, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 24
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s12, s12, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s14
; GFX11-FAKE16-NEXT: s_and_b32 s12, s13, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s55, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s14, s54, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s53, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s12, s12, s13
-; GFX11-FAKE16-NEXT: s_or_b32 s13, s14, s15
+; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v19, 26
+; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 25
+; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s15, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s13, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s14, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s12, s12, s13
+; GFX11-FAKE16-NEXT: s_or_b32 s13, s14, s15
; GFX11-FAKE16-NEXT: s_and_b32 s12, s12, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s13, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s13
-; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v19, 8
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 23
+; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 22
; GFX11-FAKE16-NEXT: s_and_b32 s0, s10, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s52, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s51, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s12, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s56, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s11, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s50, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s49, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s48, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s11
-; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 21
+; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 19
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s11, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s11
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s8, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s39, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s38, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 18
+; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s46, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s10
; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s37, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s36, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s35, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 16
+; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 15
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s11, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s9
-; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 12
+; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 0
; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s56, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff
+; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s7, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s34, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_hi, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s46, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s7
-; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 14
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 9
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s7
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s4, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s104, 0xff
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s4, v19, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 15
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s6
; GFX11-FAKE16-NEXT: s_and_b32 s4, s5, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s103, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s102, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s5, v19, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
@@ -12357,9 +12207,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:64
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 5
-; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v19, 9
-; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 1
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:96
@@ -12368,8 +12216,13 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v17, 7
; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v17, 6
; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v17, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v17, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v17, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v17, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v17, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v17, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v16, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v16, 30
; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v16, 29
; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v16, 28
; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v16, 27
@@ -12378,6 +12231,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v16, 24
; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v16, 23
; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v16, 22
+; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v16, 21
; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v16, 20
; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v16, 19
; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v16, 18
@@ -12397,6 +12251,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v16, 4
; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v16, 3
; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v16, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v16, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v16, 0
; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-FAKE16-NEXT: s_clause 0x3
; GFX11-FAKE16-NEXT: scratch_load_b32 v16, off, s32
@@ -12406,6 +12262,145 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB13_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi
+; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 0
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 1
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 2
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: s_branch .LBB13_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19705,7 +19700,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8
@@ -19731,7 +19726,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
@@ -19740,11 +19735,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3
-; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5
-; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7
-; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9
-; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11
+; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v5
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v7
+; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -19765,7 +19760,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54
@@ -19773,22 +19768,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
@@ -19811,23 +19806,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13
+; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v13
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
@@ -19840,37 +19836,37 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328
; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52
-; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84
@@ -19878,23 +19874,23 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100
; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:124
; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:140
; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:156
; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164
; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212
; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:252
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268
; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276
@@ -19907,57 +19903,57 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(12)
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB15_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -19972,12 +19968,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20006,9 +20002,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v29, v9
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -20031,15 +20025,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v50, v0
-; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20049,18 +20043,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v59, v0
; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v56, v0
-; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v39, v0
; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -20068,7 +20062,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v38, v1
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v37, v0
@@ -20076,8 +20070,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v36, v0
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -20089,39 +20083,41 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v33, v0
-; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v51, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v34, v22
; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v32, v23
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v43, v49
-; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v32, v54
-; VI-NEXT: v_mov_b32_e32 v34, v26
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v51, v3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_mov_b32_e32 v43, v0
+; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v49, v1
-; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v0
@@ -20131,28 +20127,26 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v46, v61
; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v47, v45
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v58, v44
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v48, v0
-; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v48, v28
+; VI-NEXT: v_mov_b32_e32 v47, v58
+; VI-NEXT: v_mov_b32_e32 v45, v44
+; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20168,8 +20162,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -20203,44 +20197,43 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB15_3
; VI-NEXT: .LBB15_2:
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v32, v54
-; VI-NEXT: v_mov_b32_e32 v43, v49
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v46, v61
-; VI-NEXT: v_mov_b32_e32 v47, v45
-; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v34, v26
-; VI-NEXT: v_mov_b32_e32 v58, v44
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v34, v22
+; VI-NEXT: v_mov_b32_e32 v32, v23
+; VI-NEXT: v_mov_b32_e32 v47, v58
+; VI-NEXT: v_mov_b32_e32 v45, v44
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: v_mov_b32_e32 v51, v7
-; VI-NEXT: v_mov_b32_e32 v48, v29
+; VI-NEXT: v_mov_b32_e32 v48, v28
; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: .LBB15_3: ; %Flow
; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v44, v47
-; VI-NEXT: v_mov_b32_e32 v47, v46
+; VI-NEXT: v_mov_b32_e32 v42, v45
+; VI-NEXT: v_mov_b32_e32 v45, v46
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_mov_b32_e32 v46, v49
; VI-NEXT: s_cbranch_vccnz .LBB15_5
; VI-NEXT: ; %bb.4: ; %cmp.true
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
@@ -20294,7 +20287,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
@@ -20303,8 +20296,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -20317,8 +20310,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -20393,29 +20386,29 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -20427,8 +20420,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -20440,8 +20433,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -20453,8 +20446,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -20465,8 +20458,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -20476,8 +20469,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -20488,8 +20481,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -20499,63 +20492,63 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -20565,54 +20558,57 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63
-; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -20897,7 +20893,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164
; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:188
; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196
; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204
; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212
@@ -20905,11 +20901,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260
; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:284
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300
; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308
@@ -20935,7 +20931,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
@@ -21110,7 +21106,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -21122,7 +21118,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v37, v57
; GFX9-NEXT: v_mov_b32_e32 v57, v60
; GFX9-NEXT: v_mov_b32_e32 v52, v56
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v34, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -21131,14 +21127,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -21148,12 +21144,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v51, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -21207,7 +21203,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: .LBB15_2:
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
@@ -21569,12 +21565,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v45
-; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0
; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v0, 3, v40
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v44
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v55
@@ -21584,7 +21580,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_add_u32_e32 v0, 3, v43
-; GFX9-NEXT: v_add_u32_e32 v1, 3, v36
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v42
; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -21593,7 +21589,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v0, 3, v42
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v36
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v41
@@ -24695,8 +24691,23 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: s_mov_b32 s72, s74
; SI-NEXT: s_mov_b32 s73, s75
; SI-NEXT: s_mov_b32 s74, s76
-; SI-NEXT: v_readlane_b32 s75, v21, 0
-; SI-NEXT: v_readlane_b32 s76, v21, 1
+; SI-NEXT: s_mov_b32 s75, s77
+; SI-NEXT: s_mov_b32 s76, s78
+; SI-NEXT: s_mov_b32 s77, s79
+; SI-NEXT: s_mov_b32 s78, s88
+; SI-NEXT: s_mov_b32 s79, s89
+; SI-NEXT: s_mov_b32 s88, s90
+; SI-NEXT: s_mov_b32 s89, s91
+; SI-NEXT: s_mov_b32 s90, s92
+; SI-NEXT: s_mov_b32 s91, s93
+; SI-NEXT: s_mov_b32 s92, s94
+; SI-NEXT: s_mov_b32 s93, s95
+; SI-NEXT: s_mov_b32 s94, s30
+; SI-NEXT: s_mov_b32 s95, s31
+; SI-NEXT: s_mov_b32 s30, s34
+; SI-NEXT: s_mov_b32 s31, s35
+; SI-NEXT: v_readlane_b32 s34, v21, 0
+; SI-NEXT: v_readlane_b32 s35, v21, 1
; SI-NEXT: s_cbranch_vccnz .LBB17_5
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: s_add_i32 s16, s16, 3
@@ -24758,22 +24769,22 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: s_lshl_b32 s62, s84, 16
; SI-NEXT: s_and_b32 s73, s83, 0xffff0000
; SI-NEXT: s_lshl_b32 s72, s83, 16
-; SI-NEXT: s_and_b32 s77, s82, 0xffff0000
+; SI-NEXT: s_and_b32 s75, s82, 0xffff0000
; SI-NEXT: s_lshl_b32 s74, s82, 16
-; SI-NEXT: s_and_b32 s79, s81, 0xffff0000
-; SI-NEXT: s_lshl_b32 s78, s81, 16
-; SI-NEXT: s_and_b32 s89, s80, 0xffff0000
-; SI-NEXT: s_lshl_b32 s88, s80, 16
-; SI-NEXT: s_and_b32 s91, s71, 0xffff0000
-; SI-NEXT: s_lshl_b32 s90, s71, 16
-; SI-NEXT: s_and_b32 s93, s70, 0xffff0000
-; SI-NEXT: s_lshl_b32 s92, s70, 16
-; SI-NEXT: s_and_b32 s95, s29, 0xffff0000
-; SI-NEXT: s_lshl_b32 s94, s29, 16
-; SI-NEXT: s_and_b32 s31, s28, 0xffff0000
-; SI-NEXT: s_lshl_b32 s30, s28, 16
-; SI-NEXT: s_and_b32 s35, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s34, s27, 16
+; SI-NEXT: s_and_b32 s77, s81, 0xffff0000
+; SI-NEXT: s_lshl_b32 s76, s81, 16
+; SI-NEXT: s_and_b32 s79, s80, 0xffff0000
+; SI-NEXT: s_lshl_b32 s78, s80, 16
+; SI-NEXT: s_and_b32 s89, s71, 0xffff0000
+; SI-NEXT: s_lshl_b32 s88, s71, 16
+; SI-NEXT: s_and_b32 s91, s70, 0xffff0000
+; SI-NEXT: s_lshl_b32 s90, s70, 16
+; SI-NEXT: s_and_b32 s93, s29, 0xffff0000
+; SI-NEXT: s_lshl_b32 s92, s29, 16
+; SI-NEXT: s_and_b32 s95, s28, 0xffff0000
+; SI-NEXT: s_lshl_b32 s94, s28, 16
+; SI-NEXT: s_and_b32 s31, s27, 0xffff0000
+; SI-NEXT: s_lshl_b32 s30, s27, 16
; SI-NEXT: s_and_b32 s37, s26, 0xffff0000
; SI-NEXT: s_lshl_b32 s36, s26, 16
; SI-NEXT: s_and_b32 s39, s25, 0xffff0000
@@ -24792,8 +24803,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: s_lshl_b32 s66, s19, 16
; SI-NEXT: s_and_b32 s69, s18, 0xffff0000
; SI-NEXT: s_lshl_b32 s68, s18, 16
-; SI-NEXT: s_and_b32 s76, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s75, s17, 16
+; SI-NEXT: s_and_b32 s35, s17, 0xffff0000
+; SI-NEXT: s_lshl_b32 s34, s17, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v21, s6, 2
; SI-NEXT: s_lshl_b32 s6, s16, 16
@@ -24807,9 +24818,9 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -24877,57 +24888,57 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
+; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
@@ -28367,10 +28378,10 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18
; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -28399,9 +28410,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_4
@@ -28412,12 +28423,13 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_alignbit_b32 v3, v3, v35, 16
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_mov_b32_e32 v59, v2
; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
@@ -28427,10 +28439,11 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mov_b32_e32 v47, v10
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; SI-NEXT: v_mov_b32_e32 v33, v14
+; SI-NEXT: v_mov_b32_e32 v44, v14
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_alignbit_b32 v1, v1, v33, 16
; SI-NEXT: v_mov_b32_e32 v62, v38
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
@@ -28470,20 +28483,20 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
; SI-NEXT: v_mov_b32_e32 v37, v34
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(2)
; SI-NEXT: v_mov_b32_e32 v35, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v43, v8
; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v42, v9
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v60, v9
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v8
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v8
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
@@ -28507,7 +28520,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v44, v14
+; SI-NEXT: v_mov_b32_e32 v33, v14
; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -28530,7 +28543,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: .LBB19_2: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -28546,7 +28559,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57
@@ -28558,7 +28571,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36
@@ -28670,7 +28683,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -28693,7 +28706,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58
@@ -28708,7 +28721,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
@@ -28794,16 +28807,16 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB19_4:
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v61, v53
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
@@ -28812,7 +28825,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mov_b32_e32 v57, v11
; SI-NEXT: v_mov_b32_e32 v47, v10
; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_mov_b32_e32 v33, v14
+; SI-NEXT: v_mov_b32_e32 v44, v14
; SI-NEXT: v_mov_b32_e32 v62, v38
; SI-NEXT: v_mov_b32_e32 v38, v39
; SI-NEXT: v_mov_b32_e32 v39, v41
@@ -31878,36 +31891,96 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -31917,173 +31990,99 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB20_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v31
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
; SI-NEXT: v_cvt_f32_f16_e32 v54, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v63
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v29
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v28
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v5
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v3
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v2
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v61, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v63
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
; SI-NEXT: v_cvt_f32_f16_e32 v33, v31
@@ -32098,16 +32097,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10
; SI-NEXT: v_cvt_f32_f16_e32 v51, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
@@ -32115,28 +32104,37 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v62
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v29
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@@ -32153,7 +32151,22 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: .LBB20_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB20_4
@@ -32161,87 +32174,78 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
-; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v50
-; SI-NEXT: v_mov_b32_e32 v50, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v48
-; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
-; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v56
-; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v34
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v32
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v31
; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19
; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v61
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v47
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v55
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v43
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v63
+; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v62
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v40
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v42
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v62
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63
-; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
@@ -32251,39 +32255,43 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13
; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14
; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27
+; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27
+; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42
-; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12
; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28
; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v42
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v40
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v5
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v5
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v58, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
@@ -32300,53 +32308,58 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v60, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v52
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v36
+; SI-NEXT: v_mov_b32_e32 v36, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v34
+; SI-NEXT: v_mov_b32_e32 v34, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v44
; SI-NEXT: v_cvt_f32_f16_e32 v33, v45
; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v55
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_cvt_f32_f16_e32 v47, v5
; SI-NEXT: v_cvt_f32_f16_e32 v57, v4
; SI-NEXT: v_cvt_f32_f16_e32 v59, v3
; SI-NEXT: v_cvt_f32_f16_e32 v61, v2
-; SI-NEXT: v_mov_b32_e32 v52, v29
-; SI-NEXT: v_mov_b32_e32 v48, v30
-; SI-NEXT: v_mov_b32_e32 v56, v28
-; SI-NEXT: v_mov_b32_e32 v34, v7
-; SI-NEXT: v_mov_b32_e32 v32, v6
-; SI-NEXT: v_mov_b32_e32 v46, v8
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_mov_b32_e32 v55, v29
+; SI-NEXT: v_mov_b32_e32 v44, v8
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v45, v1
; SI-NEXT: .LBB20_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -32365,34 +32378,32 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v47
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v46
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v45
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
@@ -32401,25 +32412,25 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
@@ -32428,7 +32439,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
@@ -32437,7 +32448,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
@@ -32446,7 +32457,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v37
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
@@ -32455,7 +32466,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
@@ -32464,7 +32475,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -32474,8 +32485,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32485,8 +32496,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32496,8 +32507,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32507,8 +32518,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32518,8 +32529,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32529,8 +32540,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32540,8 +32551,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32551,8 +32562,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32562,8 +32573,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32573,8 +32584,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32584,8 +32595,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32594,46 +32605,48 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v44
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v55
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v50
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v32
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -37855,24 +37868,24 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21
@@ -37893,23 +37906,23 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
@@ -37922,45 +37935,46 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB27_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v7, v0, v61
+; SI-NEXT: v_or_b32_e32 v7, v0, v58
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4
-; SI-NEXT: v_or_b32_e32 v9, v0, v50
+; SI-NEXT: v_or_b32_e32 v9, v0, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6
-; SI-NEXT: v_or_b32_e32 v10, v0, v43
+; SI-NEXT: v_or_b32_e32 v10, v0, v50
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57
-; SI-NEXT: v_or_b32_e32 v11, v0, v41
+; SI-NEXT: v_or_b32_e32 v11, v0, v45
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56
-; SI-NEXT: v_or_b32_e32 v12, v0, v40
+; SI-NEXT: v_or_b32_e32 v12, v0, v41
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49
-; SI-NEXT: v_mov_b32_e32 v52, v57
-; SI-NEXT: v_mov_b32_e32 v57, v40
-; SI-NEXT: v_mov_b32_e32 v40, v49
-; SI-NEXT: v_mov_b32_e32 v49, v13
+; SI-NEXT: v_mov_b32_e32 v36, v41
+; SI-NEXT: v_mov_b32_e32 v41, v13
; SI-NEXT: v_or_b32_e32 v13, v0, v13
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
-; SI-NEXT: v_mov_b32_e32 v36, v41
-; SI-NEXT: v_mov_b32_e32 v41, v14
-; SI-NEXT: v_or_b32_e32 v14, v0, v48
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16
; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v48
-; SI-NEXT: v_mov_b32_e32 v48, v15
+; SI-NEXT: v_mov_b32_e32 v50, v45
+; SI-NEXT: v_mov_b32_e32 v45, v14
+; SI-NEXT: v_or_b32_e32 v14, v0, v40
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16
+; SI-NEXT: v_mov_b32_e32 v52, v57
+; SI-NEXT: v_mov_b32_e32 v57, v49
+; SI-NEXT: v_mov_b32_e32 v49, v40
+; SI-NEXT: v_mov_b32_e32 v40, v15
; SI-NEXT: v_or_b32_e32 v15, v0, v15
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18
-; SI-NEXT: v_mov_b32_e32 v38, v61
+; SI-NEXT: v_mov_b32_e32 v34, v58
+; SI-NEXT: v_mov_b32_e32 v58, v61
; SI-NEXT: v_mov_b32_e32 v61, v56
; SI-NEXT: v_mov_b32_e32 v56, v16
; SI-NEXT: v_or_b32_e32 v16, v0, v37
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_or_b32_e32 v17, v0, v17
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22
; SI-NEXT: s_waitcnt expcnt(0)
@@ -37994,7 +38008,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_and_b32 s6, s20, 0xffff
; SI-NEXT: s_lshl_b32 s7, s21, 16
; SI-NEXT: v_or_b32_e32 v26, v0, v26
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59
; SI-NEXT: s_or_b32 s6, s6, s7
; SI-NEXT: s_and_b32 s7, s22, 0xffff
; SI-NEXT: s_lshl_b32 s8, s23, 16
@@ -38005,7 +38019,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_lshl_b32 s9, s25, 16
; SI-NEXT: v_mov_b32_e32 v33, v28
; SI-NEXT: v_or_b32_e32 v28, v0, v5
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43
; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_and_b32 s9, s26, 0xffff
; SI-NEXT: s_lshl_b32 s10, s27, 16
@@ -38017,7 +38031,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_lshl_b32 s11, s29, 16
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v30, v0, v3
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38
; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_mov_b32_e32 v32, v55
@@ -38025,9 +38039,9 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_mov_b32_e32 v53, v6
; SI-NEXT: v_mov_b32_e32 v47, v46
-; SI-NEXT: v_mov_b32_e32 v45, v44
-; SI-NEXT: v_mov_b32_e32 v59, v42
-; SI-NEXT: v_or_b32_e32 v31, v0, v34
+; SI-NEXT: v_mov_b32_e32 v59, v44
+; SI-NEXT: v_mov_b32_e32 v43, v42
+; SI-NEXT: v_or_b32_e32 v31, v0, v48
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
@@ -38037,12 +38051,13 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v6, s10
; SI-NEXT: s_cbranch_execnz .LBB27_3
; SI-NEXT: .LBB27_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v32, v1
; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v38, v43
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: s_and_b32 s4, s16, 0xffff
; SI-NEXT: s_lshl_b32 s5, s17, 16
@@ -38086,42 +38101,42 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v38, v0
+; SI-NEXT: v_or_b32_e32 v0, v34, v0
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v51, v0
+; SI-NEXT: v_or_b32_e32 v0, v58, v0
; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v50, v0
+; SI-NEXT: v_or_b32_e32 v0, v51, v0
; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v36, v0
+; SI-NEXT: v_or_b32_e32 v0, v50, v0
; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v57, v0
+; SI-NEXT: v_or_b32_e32 v0, v36, v0
; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v49, v0
+; SI-NEXT: v_or_b32_e32 v0, v41, v0
; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v43, v0
+; SI-NEXT: v_or_b32_e32 v0, v49, v0
; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v48, v0
+; SI-NEXT: v_or_b32_e32 v0, v40, v0
; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -38183,7 +38198,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -38199,12 +38214,12 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
@@ -38213,7 +38228,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -38248,26 +38263,26 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB27_4:
-; SI-NEXT: v_mov_b32_e32 v38, v61
+; SI-NEXT: v_mov_b32_e32 v34, v58
; SI-NEXT: v_mov_b32_e32 v32, v55
+; SI-NEXT: v_mov_b32_e32 v58, v61
; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_mov_b32_e32 v53, v6
; SI-NEXT: v_mov_b32_e32 v52, v57
; SI-NEXT: v_mov_b32_e32 v51, v50
; SI-NEXT: v_mov_b32_e32 v61, v56
-; SI-NEXT: v_mov_b32_e32 v50, v43
+; SI-NEXT: v_mov_b32_e32 v50, v45
; SI-NEXT: v_mov_b32_e32 v36, v41
-; SI-NEXT: v_mov_b32_e32 v57, v40
-; SI-NEXT: v_mov_b32_e32 v40, v49
-; SI-NEXT: v_mov_b32_e32 v49, v13
-; SI-NEXT: v_mov_b32_e32 v43, v48
-; SI-NEXT: v_mov_b32_e32 v48, v15
-; SI-NEXT: v_mov_b32_e32 v41, v14
+; SI-NEXT: v_mov_b32_e32 v41, v13
+; SI-NEXT: v_mov_b32_e32 v57, v49
+; SI-NEXT: v_mov_b32_e32 v49, v40
+; SI-NEXT: v_mov_b32_e32 v40, v15
+; SI-NEXT: v_mov_b32_e32 v45, v14
; SI-NEXT: v_mov_b32_e32 v56, v16
; SI-NEXT: v_mov_b32_e32 v47, v46
-; SI-NEXT: v_mov_b32_e32 v45, v44
-; SI-NEXT: v_mov_b32_e32 v59, v42
+; SI-NEXT: v_mov_b32_e32 v59, v44
+; SI-NEXT: v_mov_b32_e32 v43, v42
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v39, v23
@@ -58422,7 +58437,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8
@@ -58448,7 +58463,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
@@ -58457,11 +58472,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3
-; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5
-; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7
-; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9
-; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11
+; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v5
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v7
+; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -58482,7 +58497,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54
@@ -58490,22 +58505,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
@@ -58528,23 +58543,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13
+; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v13
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
@@ -58557,37 +58573,37 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328
; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52
-; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84
@@ -58595,23 +58611,23 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100
; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:124
; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:140
; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:156
; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164
; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212
; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:252
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268
; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276
@@ -58624,57 +58640,57 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(12)
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB39_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58689,12 +58705,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58723,9 +58739,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v29, v9
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -58748,15 +58762,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v50, v0
-; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58766,18 +58780,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v59, v0
; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v56, v0
-; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v39, v0
; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -58785,7 +58799,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v38, v1
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v37, v0
@@ -58793,8 +58807,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v36, v0
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -58806,39 +58820,41 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v33, v0
-; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v51, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v34, v22
; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v32, v23
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v43, v49
-; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v32, v54
-; VI-NEXT: v_mov_b32_e32 v34, v26
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v51, v3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_mov_b32_e32 v43, v0
+; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v49, v1
-; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v0
@@ -58848,28 +58864,26 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v46, v61
; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v47, v45
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v58, v44
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v48, v0
-; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v48, v28
+; VI-NEXT: v_mov_b32_e32 v47, v58
+; VI-NEXT: v_mov_b32_e32 v45, v44
+; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58885,8 +58899,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -58920,44 +58934,43 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB39_3
; VI-NEXT: .LBB39_2:
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v32, v54
-; VI-NEXT: v_mov_b32_e32 v43, v49
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v46, v61
-; VI-NEXT: v_mov_b32_e32 v47, v45
-; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v34, v26
-; VI-NEXT: v_mov_b32_e32 v58, v44
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v34, v22
+; VI-NEXT: v_mov_b32_e32 v32, v23
+; VI-NEXT: v_mov_b32_e32 v47, v58
+; VI-NEXT: v_mov_b32_e32 v45, v44
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: v_mov_b32_e32 v51, v7
-; VI-NEXT: v_mov_b32_e32 v48, v29
+; VI-NEXT: v_mov_b32_e32 v48, v28
; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: .LBB39_3: ; %Flow
; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v44, v47
-; VI-NEXT: v_mov_b32_e32 v47, v46
+; VI-NEXT: v_mov_b32_e32 v42, v45
+; VI-NEXT: v_mov_b32_e32 v45, v46
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_mov_b32_e32 v46, v49
; VI-NEXT: s_cbranch_vccnz .LBB39_5
; VI-NEXT: ; %bb.4: ; %cmp.true
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
@@ -59011,7 +59024,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
@@ -59020,8 +59033,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -59034,8 +59047,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -59110,29 +59123,29 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -59144,8 +59157,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -59157,8 +59170,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -59170,8 +59183,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -59182,8 +59195,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -59193,8 +59206,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -59205,8 +59218,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -59216,63 +59229,63 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -59282,54 +59295,57 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63
-; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -59614,7 +59630,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164
; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:188
; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196
; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204
; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212
@@ -59622,11 +59638,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260
; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:284
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300
; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308
@@ -59652,7 +59668,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
@@ -59827,7 +59843,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -59839,7 +59855,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v37, v57
; GFX9-NEXT: v_mov_b32_e32 v57, v60
; GFX9-NEXT: v_mov_b32_e32 v52, v56
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v34, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -59848,14 +59864,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -59865,12 +59881,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v51, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -59924,7 +59940,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: .LBB39_2:
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
@@ -60286,12 +60302,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v45
-; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0
; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v0, 3, v40
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v44
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v55
@@ -60301,7 +60317,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_add_u32_e32 v0, 3, v43
-; GFX9-NEXT: v_add_u32_e32 v1, 3, v36
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v42
; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -60310,7 +60326,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v0, 3, v42
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v36
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v41
@@ -67130,10 +67146,10 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18
; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -67162,9 +67178,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB43_4
@@ -67175,12 +67191,13 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_alignbit_b32 v3, v3, v35, 16
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_mov_b32_e32 v59, v2
; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
@@ -67190,10 +67207,11 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_mov_b32_e32 v47, v10
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; SI-NEXT: v_mov_b32_e32 v33, v14
+; SI-NEXT: v_mov_b32_e32 v44, v14
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_alignbit_b32 v1, v1, v33, 16
; SI-NEXT: v_mov_b32_e32 v62, v38
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
@@ -67233,20 +67251,20 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
; SI-NEXT: v_mov_b32_e32 v37, v34
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(2)
; SI-NEXT: v_mov_b32_e32 v35, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v43, v8
; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v42, v9
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v60, v9
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v8
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v8
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
@@ -67270,7 +67288,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v44, v14
+; SI-NEXT: v_mov_b32_e32 v33, v14
; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -67293,7 +67311,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: .LBB43_2: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -67309,7 +67327,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57
@@ -67321,7 +67339,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36
@@ -67433,7 +67451,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -67456,7 +67474,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58
@@ -67471,7 +67489,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
@@ -67557,16 +67575,16 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB43_4:
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v61, v53
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
@@ -67575,7 +67593,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: v_mov_b32_e32 v57, v11
; SI-NEXT: v_mov_b32_e32 v47, v10
; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_mov_b32_e32 v33, v14
+; SI-NEXT: v_mov_b32_e32 v44, v14
; SI-NEXT: v_mov_b32_e32 v62, v38
; SI-NEXT: v_mov_b32_e32 v38, v39
; SI-NEXT: v_mov_b32_e32 v39, v41
@@ -70641,36 +70659,96 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -70680,173 +70758,99 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB44_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v31
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
; SI-NEXT: v_cvt_f32_f16_e32 v54, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v63
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v29
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v28
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v5
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v3
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v2
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v61, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v63
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
; SI-NEXT: v_cvt_f32_f16_e32 v33, v31
@@ -70861,16 +70865,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10
; SI-NEXT: v_cvt_f32_f16_e32 v51, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
@@ -70878,28 +70872,37 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v62
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v29
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@@ -70916,7 +70919,22 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: .LBB44_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB44_4
@@ -70924,89 +70942,80 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v5, 1.0, v5
; SI-NEXT: v_add_f32_e32 v33, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f32_e32 v6, 1.0, v6
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT: v_add_f32_e32 v7, 1.0, v7
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v25, 1.0, v25
-; SI-NEXT: v_add_f32_e32 v26, 1.0, v26
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v50
-; SI-NEXT: v_mov_b32_e32 v50, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v48
-; SI-NEXT: v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24
-; SI-NEXT: v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v38
-; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23
-; SI-NEXT: v_add_f32_e32 v22, 1.0, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v56
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
; SI-NEXT: v_add_f32_e32 v21, 1.0, v21
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21
-; SI-NEXT: v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v34
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
; SI-NEXT: v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v31
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19
; SI-NEXT: v_add_f32_e32 v18, 1.0, v18
; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18
-; SI-NEXT: v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v61
+; SI-NEXT: v_add_f32_e32 v17, 1.0, v17
; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17
; SI-NEXT: v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v9, 1.0, v9
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v61
-; SI-NEXT: v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_add_f32_e32 v26, 1.0, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v47
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v55
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v25, 1.0, v25
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v43
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_add_f32_e32 v40, 1.0, v63
+; SI-NEXT: v_add_f32_e32 v42, 1.0, v62
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v40
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v42
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v62
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
; SI-NEXT: v_add_f32_e32 v27, 1.0, v27
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_add_f32_e32 v42, 1.0, v63
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v47
-; SI-NEXT: v_add_f32_e32 v44, 1.0, v62
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-NEXT: v_add_f32_e32 v3, 1.0, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v4, 1.0, v4
; SI-NEXT: v_add_f32_e32 v10, 1.0, v10
; SI-NEXT: v_add_f32_e32 v11, 1.0, v11
@@ -71014,39 +71023,40 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v13, 1.0, v13
; SI-NEXT: v_add_f32_e32 v14, 1.0, v14
; SI-NEXT: v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27
+; SI-NEXT: v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT: v_add_f32_e32 v23, 1.0, v23
+; SI-NEXT: v_add_f32_e32 v24, 1.0, v24
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27
+; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
; SI-NEXT: v_add_f32_e32 v30, 1.0, v30
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42
-; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12
; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28
; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v42
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v40
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v5
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v5
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v58, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
@@ -71065,51 +71075,59 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v60, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v52
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v36
+; SI-NEXT: v_mov_b32_e32 v36, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v34
+; SI-NEXT: v_mov_b32_e32 v34, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v44
; SI-NEXT: v_cvt_f32_f16_e32 v33, v45
; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v55
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_cvt_f32_f16_e32 v47, v5
; SI-NEXT: v_cvt_f32_f16_e32 v57, v4
; SI-NEXT: v_cvt_f32_f16_e32 v59, v3
; SI-NEXT: v_cvt_f32_f16_e32 v61, v2
-; SI-NEXT: v_mov_b32_e32 v52, v29
-; SI-NEXT: v_mov_b32_e32 v48, v30
-; SI-NEXT: v_mov_b32_e32 v56, v28
-; SI-NEXT: v_mov_b32_e32 v34, v7
-; SI-NEXT: v_mov_b32_e32 v32, v6
-; SI-NEXT: v_mov_b32_e32 v46, v8
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_mov_b32_e32 v55, v29
+; SI-NEXT: v_mov_b32_e32 v44, v8
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v45, v1
; SI-NEXT: .LBB44_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -71128,34 +71146,32 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v47
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v46
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v45
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
@@ -71164,25 +71180,25 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
@@ -71191,7 +71207,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
@@ -71200,7 +71216,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
@@ -71209,7 +71225,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v37
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
@@ -71218,7 +71234,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
@@ -71227,7 +71243,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -71237,8 +71253,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71248,8 +71264,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71259,8 +71275,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71270,8 +71286,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71281,8 +71297,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71292,8 +71308,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71303,8 +71319,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71314,8 +71330,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71325,8 +71341,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71336,8 +71352,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71347,8 +71363,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -71357,46 +71373,48 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v44
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v55
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v50
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v32
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -71629,12 +71647,12 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: s_lshr_b32 s4, s12, 16
; SI-NEXT: v_cvt_f32_f16_e32 v32, s4
; SI-NEXT: s_lshr_b32 s4, s13, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v13, s4
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_cvt_f32_f16_e32 v61, s4
; SI-NEXT: s_lshr_b32 s4, s14, 16
; SI-NEXT: v_cvt_f32_f16_e32 v11, s4
; SI-NEXT: s_lshr_b32 s4, s15, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v63, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v18, s4
; SI-NEXT: s_lshr_b32 s4, s40, 16
; SI-NEXT: v_cvt_f32_f16_e32 v21, s4
; SI-NEXT: s_lshr_b32 s4, s41, 16
@@ -71690,8 +71708,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v6, s6
; SI-NEXT: v_cvt_f32_f16_e32 v7, s7
; SI-NEXT: v_cvt_f32_f16_e32 v9, s8
-; SI-NEXT: v_cvt_f32_f16_e32 v59, s10
-; SI-NEXT: v_cvt_f32_f16_e32 v61, s11
+; SI-NEXT: v_cvt_f32_f16_e32 v13, s10
+; SI-NEXT: v_cvt_f32_f16_e32 v59, s11
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f32_f16_e32 v20, s15
; SI-NEXT: v_cvt_f32_f16_e32 v22, s40
@@ -71714,67 +71732,83 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v4, s20
; SI-NEXT: v_cvt_f32_f16_e32 v10, s19
; SI-NEXT: v_cvt_f32_f16_e32 v14, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v18, s17
+; SI-NEXT: v_cvt_f32_f16_e32 v63, s17
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s16
; SI-NEXT: s_cbranch_execnz .LBB45_3
; SI-NEXT: .LBB45_2: ; %cmp.true
+; SI-NEXT: v_add_f32_e64 v1, s18, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e64 v2, s19, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_add_f32_e64 v4, s20, 1.0
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_add_f32_e64 v20, s15, 1.0
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e64 v18, s27, 1.0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20
+; SI-NEXT: v_add_f32_e64 v49, s8, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e64 v15, s12, 1.0
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT: v_add_f32_e64 v1, s18, 1.0
; SI-NEXT: v_add_f32_e64 v17, s13, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_add_f32_e64 v10, s23, 1.0
+; SI-NEXT: v_add_f32_e64 v16, s26, 1.0
; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v15, v17
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e64 v2, s19, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_add_f32_e64 v23, s29, 1.0
+; SI-NEXT: v_add_f32_e64 v29, s45, 1.0
; SI-NEXT: v_add_f32_e64 v22, s40, 1.0
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16
; SI-NEXT: v_add_f32_e64 v21, s28, 1.0
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
; SI-NEXT: v_add_f32_e64 v19, s14, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19
; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v15, v19
; SI-NEXT: v_cvt_f32_f16_e32 v19, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v56
; SI-NEXT: v_add_f32_e64 v41, s6, 1.0
; SI-NEXT: v_cvt_f32_f16_e32 v7, v41
-; SI-NEXT: v_add_f32_e64 v6, s21, 1.0
-; SI-NEXT: v_add_f32_e64 v10, s23, 1.0
+; SI-NEXT: v_add_f32_e64 v8, s22, 1.0
; SI-NEXT: v_add_f32_e64 v14, s25, 1.0
-; SI-NEXT: v_add_f32_e64 v18, s27, 1.0
-; SI-NEXT: v_add_f32_e64 v23, s29, 1.0
; SI-NEXT: v_add_f32_e64 v27, s46, 1.0
-; SI-NEXT: v_add_f32_e64 v26, s42, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23
-; SI-NEXT: v_add_f32_e64 v25, s47, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17
+; SI-NEXT: v_add_f32_e64 v24, s41, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; SI-NEXT: v_add_f32_e64 v37, s10, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37
; SI-NEXT: v_add_f32_e64 v53, s7, 1.0
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v25
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v50
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v42
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v57
-; SI-NEXT: v_add_f32_e64 v49, s8, 1.0
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v44
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v57
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -71782,17 +71816,16 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v53, v14
; SI-NEXT: v_cvt_f32_f16_e32 v14, v1
; SI-NEXT: v_add_f32_e64 v1, s17, 1.0
+; SI-NEXT: v_add_f32_e64 v6, s21, 1.0
; SI-NEXT: v_add_f32_e64 v28, s43, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49
; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41
; SI-NEXT: v_add_f32_e64 v45, s9, 1.0
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v18
; SI-NEXT: v_cvt_f32_f16_e32 v41, v10
; SI-NEXT: v_cvt_f32_f16_e32 v10, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v18, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v1
; SI-NEXT: v_add_f32_e64 v1, s16, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28
; SI-NEXT: v_add_f32_e64 v34, s11, 1.0
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45
@@ -71800,35 +71833,24 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v45, v6
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v61, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v34
; SI-NEXT: v_cvt_f32_f16_e32 v34, v27
; SI-NEXT: v_cvt_f32_f16_e32 v27, v62
; SI-NEXT: v_cvt_f32_f16_e32 v62, v6
-; SI-NEXT: v_add_f32_e64 v4, s20, 1.0
-; SI-NEXT: v_add_f32_e64 v8, s22, 1.0
; SI-NEXT: v_add_f32_e64 v12, s24, 1.0
-; SI-NEXT: v_add_f32_e64 v16, s26, 1.0
-; SI-NEXT: v_add_f32_e64 v29, s45, 1.0
+; SI-NEXT: v_add_f32_e64 v25, s47, 1.0
; SI-NEXT: v_add_f32_e64 v30, s44, 1.0
-; SI-NEXT: v_add_f32_e64 v24, s41, 1.0
-; SI-NEXT: v_add_f32_e64 v20, s15, 1.0
-; SI-NEXT: v_add_f32_e64 v37, s10, 1.0
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29
+; SI-NEXT: v_add_f32_e64 v26, s42, 1.0
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v37
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v17
; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
@@ -71836,7 +71858,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v15, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v25
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
@@ -71849,19 +71871,19 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v61, v61
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v63, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v36
; SI-NEXT: v_cvt_f32_f16_e32 v29, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v52
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v56
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v58
; SI-NEXT: v_cvt_f32_f16_e32 v60, v2
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v58, v3
; SI-NEXT: .LBB45_3: ; %end
@@ -71873,7 +71895,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v63
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
@@ -72018,7 +72040,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v63
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v18
; SI-NEXT: v_cvt_f16_f32_e32 v2, v20
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -72035,7 +72057,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -72053,14 +72075,14 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v61
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v59
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -72117,7 +72139,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr58
@@ -72160,14 +72182,14 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr51
@@ -76571,24 +76593,24 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21
@@ -76609,23 +76631,23 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
@@ -76638,45 +76660,46 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB51_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v7, v0, v61
+; SI-NEXT: v_or_b32_e32 v7, v0, v58
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4
-; SI-NEXT: v_or_b32_e32 v9, v0, v50
+; SI-NEXT: v_or_b32_e32 v9, v0, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6
-; SI-NEXT: v_or_b32_e32 v10, v0, v43
+; SI-NEXT: v_or_b32_e32 v10, v0, v50
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57
-; SI-NEXT: v_or_b32_e32 v11, v0, v41
+; SI-NEXT: v_or_b32_e32 v11, v0, v45
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56
-; SI-NEXT: v_or_b32_e32 v12, v0, v40
+; SI-NEXT: v_or_b32_e32 v12, v0, v41
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49
-; SI-NEXT: v_mov_b32_e32 v52, v57
-; SI-NEXT: v_mov_b32_e32 v57, v40
-; SI-NEXT: v_mov_b32_e32 v40, v49
-; SI-NEXT: v_mov_b32_e32 v49, v13
+; SI-NEXT: v_mov_b32_e32 v36, v41
+; SI-NEXT: v_mov_b32_e32 v41, v13
; SI-NEXT: v_or_b32_e32 v13, v0, v13
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
-; SI-NEXT: v_mov_b32_e32 v36, v41
-; SI-NEXT: v_mov_b32_e32 v41, v14
-; SI-NEXT: v_or_b32_e32 v14, v0, v48
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16
; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v48
-; SI-NEXT: v_mov_b32_e32 v48, v15
+; SI-NEXT: v_mov_b32_e32 v50, v45
+; SI-NEXT: v_mov_b32_e32 v45, v14
+; SI-NEXT: v_or_b32_e32 v14, v0, v40
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16
+; SI-NEXT: v_mov_b32_e32 v52, v57
+; SI-NEXT: v_mov_b32_e32 v57, v49
+; SI-NEXT: v_mov_b32_e32 v49, v40
+; SI-NEXT: v_mov_b32_e32 v40, v15
; SI-NEXT: v_or_b32_e32 v15, v0, v15
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18
-; SI-NEXT: v_mov_b32_e32 v38, v61
+; SI-NEXT: v_mov_b32_e32 v34, v58
+; SI-NEXT: v_mov_b32_e32 v58, v61
; SI-NEXT: v_mov_b32_e32 v61, v56
; SI-NEXT: v_mov_b32_e32 v56, v16
; SI-NEXT: v_or_b32_e32 v16, v0, v37
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_or_b32_e32 v17, v0, v17
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22
; SI-NEXT: s_waitcnt expcnt(0)
@@ -76710,7 +76733,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_and_b32 s6, s20, 0xffff
; SI-NEXT: s_lshl_b32 s7, s21, 16
; SI-NEXT: v_or_b32_e32 v26, v0, v26
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59
; SI-NEXT: s_or_b32 s6, s6, s7
; SI-NEXT: s_and_b32 s7, s22, 0xffff
; SI-NEXT: s_lshl_b32 s8, s23, 16
@@ -76721,7 +76744,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_lshl_b32 s9, s25, 16
; SI-NEXT: v_mov_b32_e32 v33, v28
; SI-NEXT: v_or_b32_e32 v28, v0, v5
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43
; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_and_b32 s9, s26, 0xffff
; SI-NEXT: s_lshl_b32 s10, s27, 16
@@ -76733,7 +76756,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_lshl_b32 s11, s29, 16
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v30, v0, v3
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38
; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_mov_b32_e32 v32, v55
@@ -76741,9 +76764,9 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_mov_b32_e32 v53, v6
; SI-NEXT: v_mov_b32_e32 v47, v46
-; SI-NEXT: v_mov_b32_e32 v45, v44
-; SI-NEXT: v_mov_b32_e32 v59, v42
-; SI-NEXT: v_or_b32_e32 v31, v0, v34
+; SI-NEXT: v_mov_b32_e32 v59, v44
+; SI-NEXT: v_mov_b32_e32 v43, v42
+; SI-NEXT: v_or_b32_e32 v31, v0, v48
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
@@ -76753,12 +76776,13 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_mov_b32_e32 v6, s10
; SI-NEXT: s_cbranch_execnz .LBB51_3
; SI-NEXT: .LBB51_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v32, v1
; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v38, v43
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: s_and_b32 s4, s16, 0xffff
; SI-NEXT: s_lshl_b32 s5, s17, 16
@@ -76802,42 +76826,42 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v38, v0
+; SI-NEXT: v_or_b32_e32 v0, v34, v0
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v51, v0
+; SI-NEXT: v_or_b32_e32 v0, v58, v0
; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v50, v0
+; SI-NEXT: v_or_b32_e32 v0, v51, v0
; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v36, v0
+; SI-NEXT: v_or_b32_e32 v0, v50, v0
; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v57, v0
+; SI-NEXT: v_or_b32_e32 v0, v36, v0
; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v49, v0
+; SI-NEXT: v_or_b32_e32 v0, v41, v0
; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v43, v0
+; SI-NEXT: v_or_b32_e32 v0, v49, v0
; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v48, v0
+; SI-NEXT: v_or_b32_e32 v0, v40, v0
; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -76899,7 +76923,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -76915,12 +76939,12 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
@@ -76929,7 +76953,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -76964,26 +76988,26 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB51_4:
-; SI-NEXT: v_mov_b32_e32 v38, v61
+; SI-NEXT: v_mov_b32_e32 v34, v58
; SI-NEXT: v_mov_b32_e32 v32, v55
+; SI-NEXT: v_mov_b32_e32 v58, v61
; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_mov_b32_e32 v53, v6
; SI-NEXT: v_mov_b32_e32 v52, v57
; SI-NEXT: v_mov_b32_e32 v51, v50
; SI-NEXT: v_mov_b32_e32 v61, v56
-; SI-NEXT: v_mov_b32_e32 v50, v43
+; SI-NEXT: v_mov_b32_e32 v50, v45
; SI-NEXT: v_mov_b32_e32 v36, v41
-; SI-NEXT: v_mov_b32_e32 v57, v40
-; SI-NEXT: v_mov_b32_e32 v40, v49
-; SI-NEXT: v_mov_b32_e32 v49, v13
-; SI-NEXT: v_mov_b32_e32 v43, v48
-; SI-NEXT: v_mov_b32_e32 v48, v15
-; SI-NEXT: v_mov_b32_e32 v41, v14
+; SI-NEXT: v_mov_b32_e32 v41, v13
+; SI-NEXT: v_mov_b32_e32 v57, v49
+; SI-NEXT: v_mov_b32_e32 v49, v40
+; SI-NEXT: v_mov_b32_e32 v40, v15
+; SI-NEXT: v_mov_b32_e32 v45, v14
; SI-NEXT: v_mov_b32_e32 v56, v16
; SI-NEXT: v_mov_b32_e32 v47, v46
-; SI-NEXT: v_mov_b32_e32 v45, v44
-; SI-NEXT: v_mov_b32_e32 v59, v42
+; SI-NEXT: v_mov_b32_e32 v59, v44
+; SI-NEXT: v_mov_b32_e32 v43, v42
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v39, v23
@@ -85923,7 +85947,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14
; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s37, 5
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s101, 5
-; GFX11-TRUE16-NEXT: s_mov_b32 s57, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0
; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
@@ -86101,7 +86125,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s57
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_3
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3
@@ -86774,9 +86798,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 0
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 0
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
@@ -86834,72 +86858,72 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 1
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 2
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 3
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 4
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 5
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 6
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 7
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 8
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 9
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 10
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 11
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 12
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 13
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 14
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 15
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 17
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 18
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 19
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 20
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 21
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 22
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 23
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 24
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 25
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 26
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 27
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 28
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 29
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 30
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 31
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 0
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 1
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 1
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 2
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 5
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 17
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 18
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 19
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 20
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 21
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 22
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 23
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 24
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 25
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 26
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 27
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 28
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 29
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 30
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s57, 31
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s57, 0
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s57, 1
; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 2
; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 3
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16
@@ -86951,7 +86975,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14
; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s37, 5
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s101, 5
-; GFX11-FAKE16-NEXT: s_mov_b32 s101, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s44, 0
; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
@@ -86984,297 +87008,155 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s85, 29
; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s86, 30
; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s87, 31
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_2
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s25, 8
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s5, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 17
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s4, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 18
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s6, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 19
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 20
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 21
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 22
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 23
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 25
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 26
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 27
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 28
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 29
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 30
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 31
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[4:5], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s27, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s27, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s26, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s25, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s25, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s24, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s24, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s23, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s23, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s23, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s22, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s21, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s21, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 11
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s20, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s19, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s19, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s18, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 13
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s18, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s17, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s17, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s17, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s16, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 15
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s3, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s3, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12
; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 3
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s0, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[10:11], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 6
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[22:23], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 16
; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6
; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 9
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 26
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 27
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 28
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 29
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 30
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 31
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; GFX11-FAKE16-NEXT: s_branch .LBB57_3
-; GFX11-FAKE16-NEXT: .LBB57_2:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: s_mov_b32 s101, -1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 0
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 1
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 2
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 3
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 5
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 6
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 7
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 9
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 10
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 11
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 12
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 13
-; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 14
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 15
-; GFX11-FAKE16-NEXT: .LBB57_3: ; %Flow
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101
-; GFX11-FAKE16-NEXT: s_mov_b32 s101, s104
-; GFX11-FAKE16-NEXT: s_mov_b32 s104, s57
-; GFX11-FAKE16-NEXT: s_mov_b32 s57, s69
-; GFX11-FAKE16-NEXT: s_mov_b32 s69, s42
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_5
-; GFX11-FAKE16-NEXT: ; %bb.4: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s46, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s47, 3
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[6:7], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s46, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s47, 1
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[8:9], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s44
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_3
+; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3
; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0
; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3
@@ -87307,355 +87189,321 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0
; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[4:5], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s27, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s27, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s26, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s26, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s25, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s25, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s25, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s24, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s23, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s23, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s22, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s21, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s21, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s20, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 11
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s20, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s19, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s19, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s19, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s18, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 13
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s17, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s17, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s16, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s3, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 15
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s3, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s5, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s4, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[10:11], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[22:23], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 26
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 27
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 28
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 29
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 30
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 31
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14
-; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s46, 2
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s47, 3
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[6:7], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12
-; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s46, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 3
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s47, 1
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[8:9], 24
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10
-; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 8
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 6
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 16
; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 9
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; GFX11-FAKE16-NEXT: .LBB57_5: ; %end
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 16
+; GFX11-FAKE16-NEXT: .LBB57_3: ; %end
+; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s101, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s45, s45, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s44
+; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s30, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s44, s45, s44
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s42, s74, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s43
-; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s94, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43
-; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s45, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s42
-; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 9
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s45
-; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s30, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s44, s44, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45
-; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 8
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42
-; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v18, 7
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s43
+; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s94, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s42, s42, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s44
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s100, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s98, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44
; GFX11-FAKE16-NEXT: s_and_b32 s44, s99, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s42, s42, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s98, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16
; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45
-; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s42
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s97, 8
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42
+; GFX11-FAKE16-NEXT: s_and_b32 s42, s96, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s87, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44
+; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43
; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v18, 6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s86, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 5
-; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v18, 2
; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s92, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s85, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s84, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s82, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s80, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s83, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 29
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v17, 4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v17, 3
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s69, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s81, 8
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v19, 28
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 19
; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 31
+; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s71, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s68, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s3, s19, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s20, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 30
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s70, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s78, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s86, 8
; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 21
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16
-; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v17, 2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s78, 8
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 27
-; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 24
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 22
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s67, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s66, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s64, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_and_b32 s1, s21, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v16, 30
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s54, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 25
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s65, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v16, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v16, 0
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s53, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 23
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 18
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s55, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s51, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s62, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s74, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3
; GFX11-FAKE16-NEXT: s_and_b32 s16, s23, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 20
+; GFX11-FAKE16-NEXT: s_and_b32 s17, s52, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s16, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s17, s17, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s17, s17, s18
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s16, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s17, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s97, 8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 17
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s50, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s49, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s88, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s69, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s18, s72, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v17, 1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v16, 21
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 16
; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s48, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s39, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s38, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s73, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s96, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s37, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s36, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s76, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
; GFX11-FAKE16-NEXT: s_and_b32 s16, s27, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s87, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s35, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s18, s34, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s19, vcc_hi, 8
; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v18, 7
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
; GFX11-FAKE16-NEXT: s_and_b32 s0, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s85, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s84, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s104, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s103, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s72, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s29, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s83, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s82, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s81, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s102, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
@@ -87664,147 +87512,176 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v18, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 5
; GFX11-FAKE16-NEXT: s_and_b32 s2, s40, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s61, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s16, s80, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s18, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s19, v19, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s62, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v18, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
+; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v18, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s19, v18, 2
; GFX11-FAKE16-NEXT: s_and_b32 s16, s41, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s60, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s18, s71, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s70, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19
+; GFX11-FAKE16-NEXT: s_and_b32 s18, s18, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s19, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17
+; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19
; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 4
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v18, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 0
; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s58, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s59, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s60, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 30
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s15, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s68, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s14, s67, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s66, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s14, s15
-; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 29
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s14, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s15, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s14, s15
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s12, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s65, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s12, s64, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s14, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 28
+; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v19, 27
+; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s58, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 24
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s12, s12, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s14
; GFX11-FAKE16-NEXT: s_and_b32 s12, s13, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s55, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s14, s54, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s53, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s12, s12, s13
-; GFX11-FAKE16-NEXT: s_or_b32 s13, s14, s15
+; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v19, 26
+; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 25
+; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s15, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s13, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s14, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s12, s12, s13
+; GFX11-FAKE16-NEXT: s_or_b32 s13, s14, s15
; GFX11-FAKE16-NEXT: s_and_b32 s12, s12, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s13, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s13
-; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v19, 8
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 23
+; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 22
; GFX11-FAKE16-NEXT: s_and_b32 s0, s10, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s52, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s51, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s12, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s56, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s11, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s50, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s49, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s48, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s11
-; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 21
+; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 19
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s11, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s11
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s8, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s39, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s38, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 18
+; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s46, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xff
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s10
; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s37, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s36, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s35, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 16
+; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 15
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s11, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s9
-; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 12
+; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 0
; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s56, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff
+; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s7, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s34, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_hi, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s46, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s7
-; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 14
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 9
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s7
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s4, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s104, 0xff
+; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s4, v19, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 15
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s6
; GFX11-FAKE16-NEXT: s_and_b32 s4, s5, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s103, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s102, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s5, v19, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
@@ -87812,9 +87689,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:64
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 5
-; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v19, 9
-; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 1
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:96
@@ -87823,8 +87698,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v17, 7
; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v17, 6
; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v17, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v17, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v17, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v17, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v17, 1
; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v17, 0
; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v16, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v16, 30
; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v16, 29
; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v16, 28
; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v16, 27
@@ -87833,6 +87713,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v16, 24
; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v16, 23
; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v16, 22
+; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v16, 21
; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v16, 20
; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v16, 19
; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v16, 18
@@ -87852,6 +87733,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v16, 4
; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v16, 3
; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v16, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v16, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v16, 0
; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-FAKE16-NEXT: s_clause 0x3
; GFX11-FAKE16-NEXT: scratch_load_b32 v16, off, s32
@@ -87861,6 +87744,145 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB57_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi
+; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 0
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 1
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 2
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: s_branch .LBB57_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -95160,7 +95182,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8
@@ -95186,7 +95208,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
@@ -95195,11 +95217,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3
-; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5
-; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7
-; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9
-; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11
+; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v5
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v7
+; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -95220,7 +95242,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54
@@ -95228,22 +95250,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
@@ -95266,23 +95288,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13
+; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v13
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
@@ -95295,37 +95318,37 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328
; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52
-; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84
@@ -95333,23 +95356,23 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100
; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:124
; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:140
; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:156
; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164
; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212
; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:252
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268
; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276
@@ -95362,57 +95385,57 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(12)
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB59_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -95427,12 +95450,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -95461,9 +95484,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v29, v9
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -95486,15 +95507,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v50, v0
-; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -95504,18 +95525,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v59, v0
; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v56, v0
-; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v39, v0
; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -95523,7 +95544,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v38, v1
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v37, v0
@@ -95531,8 +95552,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v36, v0
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -95544,39 +95565,41 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v33, v0
-; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v51, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v34, v22
; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v32, v23
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v43, v49
-; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v32, v54
-; VI-NEXT: v_mov_b32_e32 v34, v26
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v51, v3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_mov_b32_e32 v43, v0
+; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v49, v1
-; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v0
@@ -95586,28 +95609,26 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v46, v61
; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v47, v45
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v58, v44
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v48, v0
-; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v48, v28
+; VI-NEXT: v_mov_b32_e32 v47, v58
+; VI-NEXT: v_mov_b32_e32 v45, v44
+; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -95623,8 +95644,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -95658,44 +95679,43 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB59_3
; VI-NEXT: .LBB59_2:
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v32, v54
-; VI-NEXT: v_mov_b32_e32 v43, v49
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v46, v61
-; VI-NEXT: v_mov_b32_e32 v47, v45
-; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v34, v26
-; VI-NEXT: v_mov_b32_e32 v58, v44
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v34, v22
+; VI-NEXT: v_mov_b32_e32 v32, v23
+; VI-NEXT: v_mov_b32_e32 v47, v58
+; VI-NEXT: v_mov_b32_e32 v45, v44
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: v_mov_b32_e32 v51, v7
-; VI-NEXT: v_mov_b32_e32 v48, v29
+; VI-NEXT: v_mov_b32_e32 v48, v28
; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: .LBB59_3: ; %Flow
; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v44, v47
-; VI-NEXT: v_mov_b32_e32 v47, v46
+; VI-NEXT: v_mov_b32_e32 v42, v45
+; VI-NEXT: v_mov_b32_e32 v45, v46
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_mov_b32_e32 v46, v49
; VI-NEXT: s_cbranch_vccnz .LBB59_5
; VI-NEXT: ; %bb.4: ; %cmp.true
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
@@ -95749,7 +95769,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
@@ -95758,8 +95778,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -95772,8 +95792,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -95848,29 +95868,29 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -95882,8 +95902,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -95895,8 +95915,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -95908,8 +95928,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -95920,8 +95940,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -95931,8 +95951,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -95943,8 +95963,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -95954,63 +95974,63 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -96020,54 +96040,57 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63
-; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -96352,7 +96375,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164
; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:188
; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196
; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204
; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212
@@ -96360,11 +96383,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260
; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:284
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300
; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308
@@ -96390,7 +96413,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
@@ -96565,7 +96588,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -96577,7 +96600,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v37, v57
; GFX9-NEXT: v_mov_b32_e32 v57, v60
; GFX9-NEXT: v_mov_b32_e32 v52, v56
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v34, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -96586,14 +96609,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -96603,12 +96626,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v51, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -96662,7 +96685,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: .LBB59_2:
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
@@ -97024,12 +97047,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v45
-; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0
; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v0, 3, v40
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v44
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v55
@@ -97039,7 +97062,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_add_u32_e32 v0, 3, v43
-; GFX9-NEXT: v_add_u32_e32 v1, 3, v36
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v42
; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -97048,7 +97071,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v0, 3, v42
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v36
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v41
@@ -103810,10 +103833,10 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18
; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -103842,9 +103865,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB63_4
@@ -103855,12 +103878,13 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_alignbit_b32 v3, v3, v35, 16
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_mov_b32_e32 v59, v2
; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
@@ -103870,10 +103894,11 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mov_b32_e32 v47, v10
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; SI-NEXT: v_mov_b32_e32 v33, v14
+; SI-NEXT: v_mov_b32_e32 v44, v14
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_alignbit_b32 v1, v1, v33, 16
; SI-NEXT: v_mov_b32_e32 v62, v38
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
@@ -103913,20 +103938,20 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
; SI-NEXT: v_mov_b32_e32 v37, v34
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(2)
; SI-NEXT: v_mov_b32_e32 v35, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v43, v8
; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v42, v9
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v60, v9
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v8
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v8
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
@@ -103950,7 +103975,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v44, v14
+; SI-NEXT: v_mov_b32_e32 v33, v14
; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -103973,7 +103998,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: .LBB63_2: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -103989,7 +104014,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57
@@ -104001,7 +104026,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36
@@ -104113,7 +104138,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -104136,7 +104161,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58
@@ -104151,7 +104176,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
@@ -104237,16 +104262,16 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB63_4:
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v61, v53
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
@@ -104255,7 +104280,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_mov_b32_e32 v57, v11
; SI-NEXT: v_mov_b32_e32 v47, v10
; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_mov_b32_e32 v33, v14
+; SI-NEXT: v_mov_b32_e32 v44, v14
; SI-NEXT: v_mov_b32_e32 v62, v38
; SI-NEXT: v_mov_b32_e32 v38, v39
; SI-NEXT: v_mov_b32_e32 v39, v41
@@ -107321,36 +107346,98 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -107358,175 +107445,104 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB64_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v31
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30
; SI-NEXT: v_cvt_f32_f16_e32 v40, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28
; SI-NEXT: v_cvt_f32_f16_e32 v54, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v63
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v32
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v28
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v5
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v61, v32
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v2
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v63
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v29
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
; SI-NEXT: v_cvt_f32_f16_e32 v33, v31
@@ -107539,18 +107555,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11
; SI-NEXT: v_cvt_f32_f16_e32 v49, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
@@ -107558,28 +107564,33 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v62
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@@ -107596,7 +107607,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: .LBB64_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB64_4
@@ -107606,9 +107632,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9
; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
@@ -107619,114 +107645,111 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15
; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
-; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc
-; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19
-; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
-; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21
-; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
-; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
-; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26
-; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v50
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
+; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
+; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v48
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
-; SI-NEXT: v_mov_b32_e32 v38, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v46
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v34
; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v31
; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v32
-; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v2
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14
-; SI-NEXT: v_addc_u32_e32 v44, vcc, 0, v62, vcc
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v61
-; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v44
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v42
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10
+; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v50
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v41
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
+; SI-NEXT: v_mov_b32_e32 v55, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35
; SI-NEXT: v_cvt_f32_f16_e32 v58, v35
; SI-NEXT: v_cvt_f32_f16_e32 v35, v43
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
+; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v63
+; SI-NEXT: v_addc_u32_e32 v42, vcc, 0, v62, vcc
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v53
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v53
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v62
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12
; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29
; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v42
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v6
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
@@ -107744,51 +107767,59 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v60, v33
; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v52
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v36
+; SI-NEXT: v_mov_b32_e32 v36, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v34
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v34, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v44
; SI-NEXT: v_cvt_f32_f16_e32 v33, v45
; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v55
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v2
; SI-NEXT: v_cvt_f32_f16_e32 v47, v5
; SI-NEXT: v_cvt_f32_f16_e32 v57, v4
; SI-NEXT: v_cvt_f32_f16_e32 v59, v3
; SI-NEXT: v_cvt_f32_f16_e32 v61, v6
-; SI-NEXT: v_mov_b32_e32 v50, v29
-; SI-NEXT: v_mov_b32_e32 v48, v30
-; SI-NEXT: v_mov_b32_e32 v46, v28
-; SI-NEXT: v_mov_b32_e32 v34, v8
-; SI-NEXT: v_mov_b32_e32 v32, v7
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v52, v30
+; SI-NEXT: v_mov_b32_e32 v44, v29
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v43, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v45, v1
; SI-NEXT: .LBB64_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -107814,63 +107845,59 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v47
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v46
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v45
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v53
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
@@ -107879,7 +107906,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
@@ -107888,7 +107915,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v37
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
@@ -107897,7 +107924,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
@@ -107906,7 +107933,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -107916,9 +107943,20 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -107929,7 +107967,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -107940,7 +107978,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -107951,7 +107989,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -107962,7 +108000,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -107973,7 +108011,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -107982,9 +108020,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -107994,8 +108032,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -108006,7 +108044,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -108017,7 +108055,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -108028,7 +108066,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -108038,8 +108076,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -108047,35 +108085,28 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v46
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v44
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v32
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -113327,24 +113358,24 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21
@@ -113365,23 +113396,23 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
@@ -113394,45 +113425,46 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB71_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v7, v0, v61
+; SI-NEXT: v_or_b32_e32 v7, v0, v58
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4
-; SI-NEXT: v_or_b32_e32 v9, v0, v50
+; SI-NEXT: v_or_b32_e32 v9, v0, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6
-; SI-NEXT: v_or_b32_e32 v10, v0, v43
+; SI-NEXT: v_or_b32_e32 v10, v0, v50
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57
-; SI-NEXT: v_or_b32_e32 v11, v0, v41
+; SI-NEXT: v_or_b32_e32 v11, v0, v45
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56
-; SI-NEXT: v_or_b32_e32 v12, v0, v40
+; SI-NEXT: v_or_b32_e32 v12, v0, v41
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49
-; SI-NEXT: v_mov_b32_e32 v52, v57
-; SI-NEXT: v_mov_b32_e32 v57, v40
-; SI-NEXT: v_mov_b32_e32 v40, v49
-; SI-NEXT: v_mov_b32_e32 v49, v13
+; SI-NEXT: v_mov_b32_e32 v36, v41
+; SI-NEXT: v_mov_b32_e32 v41, v13
; SI-NEXT: v_or_b32_e32 v13, v0, v13
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
-; SI-NEXT: v_mov_b32_e32 v36, v41
-; SI-NEXT: v_mov_b32_e32 v41, v14
-; SI-NEXT: v_or_b32_e32 v14, v0, v48
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16
; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v48
-; SI-NEXT: v_mov_b32_e32 v48, v15
+; SI-NEXT: v_mov_b32_e32 v50, v45
+; SI-NEXT: v_mov_b32_e32 v45, v14
+; SI-NEXT: v_or_b32_e32 v14, v0, v40
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16
+; SI-NEXT: v_mov_b32_e32 v52, v57
+; SI-NEXT: v_mov_b32_e32 v57, v49
+; SI-NEXT: v_mov_b32_e32 v49, v40
+; SI-NEXT: v_mov_b32_e32 v40, v15
; SI-NEXT: v_or_b32_e32 v15, v0, v15
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18
-; SI-NEXT: v_mov_b32_e32 v38, v61
+; SI-NEXT: v_mov_b32_e32 v34, v58
+; SI-NEXT: v_mov_b32_e32 v58, v61
; SI-NEXT: v_mov_b32_e32 v61, v56
; SI-NEXT: v_mov_b32_e32 v56, v16
; SI-NEXT: v_or_b32_e32 v16, v0, v37
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_or_b32_e32 v17, v0, v17
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22
; SI-NEXT: s_waitcnt expcnt(0)
@@ -113466,7 +113498,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_and_b32 s6, s20, 0xffff
; SI-NEXT: s_lshl_b32 s7, s21, 16
; SI-NEXT: v_or_b32_e32 v26, v0, v26
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59
; SI-NEXT: s_or_b32 s6, s6, s7
; SI-NEXT: s_and_b32 s7, s22, 0xffff
; SI-NEXT: s_lshl_b32 s8, s23, 16
@@ -113477,7 +113509,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_lshl_b32 s9, s25, 16
; SI-NEXT: v_mov_b32_e32 v33, v28
; SI-NEXT: v_or_b32_e32 v28, v0, v5
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43
; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_and_b32 s9, s26, 0xffff
; SI-NEXT: s_lshl_b32 s10, s27, 16
@@ -113489,7 +113521,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_lshl_b32 s11, s29, 16
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v30, v0, v3
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38
; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_mov_b32_e32 v32, v55
@@ -113497,9 +113529,9 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_mov_b32_e32 v53, v6
; SI-NEXT: v_mov_b32_e32 v47, v46
-; SI-NEXT: v_mov_b32_e32 v45, v44
-; SI-NEXT: v_mov_b32_e32 v59, v42
-; SI-NEXT: v_or_b32_e32 v31, v0, v34
+; SI-NEXT: v_mov_b32_e32 v59, v44
+; SI-NEXT: v_mov_b32_e32 v43, v42
+; SI-NEXT: v_or_b32_e32 v31, v0, v48
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
@@ -113509,12 +113541,13 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v6, s10
; SI-NEXT: s_cbranch_execnz .LBB71_3
; SI-NEXT: .LBB71_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v32, v1
; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v38, v43
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: s_and_b32 s4, s16, 0xffff
; SI-NEXT: s_lshl_b32 s5, s17, 16
@@ -113558,42 +113591,42 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v38, v0
+; SI-NEXT: v_or_b32_e32 v0, v34, v0
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v51, v0
+; SI-NEXT: v_or_b32_e32 v0, v58, v0
; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v50, v0
+; SI-NEXT: v_or_b32_e32 v0, v51, v0
; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v36, v0
+; SI-NEXT: v_or_b32_e32 v0, v50, v0
; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v57, v0
+; SI-NEXT: v_or_b32_e32 v0, v36, v0
; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v49, v0
+; SI-NEXT: v_or_b32_e32 v0, v41, v0
; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v43, v0
+; SI-NEXT: v_or_b32_e32 v0, v49, v0
; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v48, v0
+; SI-NEXT: v_or_b32_e32 v0, v40, v0
; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -113655,7 +113688,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -113671,12 +113704,12 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
@@ -113685,7 +113718,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -113720,26 +113753,26 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB71_4:
-; SI-NEXT: v_mov_b32_e32 v38, v61
+; SI-NEXT: v_mov_b32_e32 v34, v58
; SI-NEXT: v_mov_b32_e32 v32, v55
+; SI-NEXT: v_mov_b32_e32 v58, v61
; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_mov_b32_e32 v53, v6
; SI-NEXT: v_mov_b32_e32 v52, v57
; SI-NEXT: v_mov_b32_e32 v51, v50
; SI-NEXT: v_mov_b32_e32 v61, v56
-; SI-NEXT: v_mov_b32_e32 v50, v43
+; SI-NEXT: v_mov_b32_e32 v50, v45
; SI-NEXT: v_mov_b32_e32 v36, v41
-; SI-NEXT: v_mov_b32_e32 v57, v40
-; SI-NEXT: v_mov_b32_e32 v40, v49
-; SI-NEXT: v_mov_b32_e32 v49, v13
-; SI-NEXT: v_mov_b32_e32 v43, v48
-; SI-NEXT: v_mov_b32_e32 v48, v15
-; SI-NEXT: v_mov_b32_e32 v41, v14
+; SI-NEXT: v_mov_b32_e32 v41, v13
+; SI-NEXT: v_mov_b32_e32 v57, v49
+; SI-NEXT: v_mov_b32_e32 v49, v40
+; SI-NEXT: v_mov_b32_e32 v40, v15
+; SI-NEXT: v_mov_b32_e32 v45, v14
; SI-NEXT: v_mov_b32_e32 v56, v16
; SI-NEXT: v_mov_b32_e32 v47, v46
-; SI-NEXT: v_mov_b32_e32 v45, v44
-; SI-NEXT: v_mov_b32_e32 v59, v42
+; SI-NEXT: v_mov_b32_e32 v59, v44
+; SI-NEXT: v_mov_b32_e32 v43, v42
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v39, v23
@@ -120323,7 +120356,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1
@@ -120341,7 +120374,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v30
; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v30
; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v30
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v32
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v32
; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v32
@@ -120509,9 +120542,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v33, s71
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s69
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s68
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, s68
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s67
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s66
@@ -120833,12 +120866,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v31
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v32
; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -131843,7 +131876,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8
@@ -131869,7 +131902,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
@@ -131878,11 +131911,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3
-; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5
-; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7
-; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9
-; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11
+; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v5
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v7
+; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -131903,7 +131936,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54
@@ -131911,22 +131944,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
@@ -131949,23 +131982,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13
+; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v13
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
@@ -131978,37 +132012,37 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328
; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52
-; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84
@@ -132016,23 +132050,23 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100
; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:124
; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:140
; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:156
; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164
; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172
-; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:180
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:188
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:204
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212
; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:252
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268
; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276
@@ -132045,57 +132079,57 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(12)
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB75_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -132110,12 +132144,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_lshl_b32 s7, s23, 8
; VI-NEXT: s_lshl_b32 s8, s27, 8
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -132144,9 +132178,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v29, v9
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -132169,15 +132201,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v50, v0
-; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -132187,18 +132219,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v59, v0
; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v56, v0
-; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v39, v0
; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -132206,7 +132238,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v38, v1
; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v37, v0
@@ -132214,8 +132246,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v36, v0
; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -132227,39 +132259,41 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v33, v0
-; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v51, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v34, v22
; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v32, v23
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v43, v49
-; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v32, v54
-; VI-NEXT: v_mov_b32_e32 v34, v26
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v51, v3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_mov_b32_e32 v43, v0
+; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v49, v1
-; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v54, v0
@@ -132269,28 +132303,26 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_mov_b32_e32 v46, v61
; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v47, v45
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v58, v44
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v48, v0
-; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v48, v28
+; VI-NEXT: v_mov_b32_e32 v47, v58
+; VI-NEXT: v_mov_b32_e32 v45, v44
+; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -132306,8 +132338,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v57, v0
; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -132341,44 +132373,43 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: s_branch .LBB75_3
; VI-NEXT: .LBB75_2:
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v32, v54
-; VI-NEXT: v_mov_b32_e32 v43, v49
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v46, v61
-; VI-NEXT: v_mov_b32_e32 v47, v45
-; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v34, v26
-; VI-NEXT: v_mov_b32_e32 v58, v44
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v34, v22
+; VI-NEXT: v_mov_b32_e32 v32, v23
+; VI-NEXT: v_mov_b32_e32 v47, v58
+; VI-NEXT: v_mov_b32_e32 v45, v44
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_mov_b32_e32 v63, v42
; VI-NEXT: v_mov_b32_e32 v51, v7
-; VI-NEXT: v_mov_b32_e32 v48, v29
+; VI-NEXT: v_mov_b32_e32 v48, v28
; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: .LBB75_3: ; %Flow
; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v44, v47
-; VI-NEXT: v_mov_b32_e32 v47, v46
+; VI-NEXT: v_mov_b32_e32 v42, v45
+; VI-NEXT: v_mov_b32_e32 v45, v46
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_mov_b32_e32 v46, v49
; VI-NEXT: s_cbranch_vccnz .LBB75_5
; VI-NEXT: ; %bb.4: ; %cmp.true
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
@@ -132432,7 +132463,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
@@ -132441,8 +132472,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -132455,8 +132486,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -132531,29 +132562,29 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -132565,8 +132596,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -132578,8 +132609,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -132591,8 +132622,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -132603,8 +132634,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -132614,8 +132645,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
@@ -132626,8 +132657,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -132637,63 +132668,63 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -132703,54 +132734,57 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63
-; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -133035,7 +133069,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164
; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:188
; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196
; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204
; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212
@@ -133043,11 +133077,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260
; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:284
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292
; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300
; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308
@@ -133073,7 +133107,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
@@ -133248,7 +133282,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -133260,7 +133294,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v37, v57
; GFX9-NEXT: v_mov_b32_e32 v57, v60
; GFX9-NEXT: v_mov_b32_e32 v52, v56
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_mov_b32_e32 v34, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -133269,14 +133303,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -133286,12 +133320,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v51, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -133345,7 +133379,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: .LBB75_2:
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
@@ -133707,12 +133741,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v45
-; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0
; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v0, 3, v40
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v44
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v55
@@ -133722,7 +133756,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_add_u32_e32 v0, 3, v43
-; GFX9-NEXT: v_add_u32_e32 v1, 3, v36
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v42
; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -133731,7 +133765,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v0, 3, v42
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v36
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v1, 3, v41
@@ -136664,19 +136698,20 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_writelane_b32 v62, s46, 3
; SI-NEXT: s_cbranch_execnz .LBB77_4
; SI-NEXT: .LBB77_2: ; %cmp.true
-; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0
+; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0
; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0
-; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0
+; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11
+; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0
; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42
; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42
@@ -136705,8 +136740,7 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0
; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0
; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0
-; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0
-; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0
+; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0
; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0
; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8
@@ -136715,27 +136749,27 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16
-; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v28
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v36
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v36
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36
; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50
@@ -136826,14 +136860,14 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: ; kill: killed $sgpr46
; SI-NEXT: s_branch .LBB77_2
; SI-NEXT: .LBB77_4:
-; SI-NEXT: v_mov_b32_e32 v1, s71
+; SI-NEXT: v_mov_b32_e32 v1, s85
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s69
+; SI-NEXT: v_mov_b32_e32 v1, s83
; SI-NEXT: v_readlane_b32 s4, v62, 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s68
+; SI-NEXT: v_mov_b32_e32 v1, s82
; SI-NEXT: v_mov_b32_e32 v61, s4
; SI-NEXT: v_readlane_b32 s4, v62, 1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -136852,27 +136886,27 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_mov_b32_e32 v14, s96
; SI-NEXT: v_mov_b32_e32 v21, s87
; SI-NEXT: v_mov_b32_e32 v18, s86
-; SI-NEXT: v_mov_b32_e32 v25, s85
; SI-NEXT: v_mov_b32_e32 v22, s84
-; SI-NEXT: v_mov_b32_e32 v29, s83
-; SI-NEXT: v_mov_b32_e32 v26, s82
-; SI-NEXT: v_mov_b32_e32 v33, s81
-; SI-NEXT: v_mov_b32_e32 v30, s80
-; SI-NEXT: v_mov_b32_e32 v34, s70
+; SI-NEXT: v_mov_b32_e32 v16, s81
+; SI-NEXT: v_mov_b32_e32 v15, s80
+; SI-NEXT: v_mov_b32_e32 v12, s71
+; SI-NEXT: v_mov_b32_e32 v11, s70
+; SI-NEXT: v_mov_b32_e32 v20, s69
+; SI-NEXT: v_mov_b32_e32 v19, s68
; SI-NEXT: v_mov_b32_e32 v8, s67
; SI-NEXT: v_mov_b32_e32 v7, s66
; SI-NEXT: v_mov_b32_e32 v24, s65
; SI-NEXT: v_mov_b32_e32 v23, s64
-; SI-NEXT: v_mov_b32_e32 v16, s55
-; SI-NEXT: v_mov_b32_e32 v15, s54
+; SI-NEXT: v_mov_b32_e32 v26, s55
+; SI-NEXT: v_mov_b32_e32 v25, s54
; SI-NEXT: v_mov_b32_e32 v28, s53
; SI-NEXT: v_mov_b32_e32 v27, s52
-; SI-NEXT: v_mov_b32_e32 v12, s51
-; SI-NEXT: v_mov_b32_e32 v11, s50
+; SI-NEXT: v_mov_b32_e32 v30, s51
+; SI-NEXT: v_mov_b32_e32 v29, s50
; SI-NEXT: v_mov_b32_e32 v32, s49
; SI-NEXT: v_mov_b32_e32 v31, s48
-; SI-NEXT: v_mov_b32_e32 v20, s39
-; SI-NEXT: v_mov_b32_e32 v19, s38
+; SI-NEXT: v_mov_b32_e32 v34, s39
+; SI-NEXT: v_mov_b32_e32 v33, s38
; SI-NEXT: v_mov_b32_e32 v36, s37
; SI-NEXT: v_mov_b32_e32 v35, s36
; SI-NEXT: v_mov_b32_e32 v38, s35
@@ -137005,9 +137039,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -137019,9 +137053,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -137033,9 +137067,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -137054,6 +137088,27 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
+; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
+; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
+; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_readlane_b32 s99, v63, 35
@@ -137098,36 +137153,15 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -140441,10 +140475,10 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18
; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s22
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -140473,9 +140507,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB79_4
@@ -140486,12 +140520,13 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_alignbit_b32 v3, v3, v35, 16
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_mov_b32_e32 v59, v2
; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
@@ -140501,10 +140536,11 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_mov_b32_e32 v47, v10
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; SI-NEXT: v_mov_b32_e32 v33, v14
+; SI-NEXT: v_mov_b32_e32 v44, v14
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_alignbit_b32 v1, v1, v33, 16
; SI-NEXT: v_mov_b32_e32 v62, v38
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
@@ -140544,20 +140580,20 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
; SI-NEXT: v_mov_b32_e32 v37, v34
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(2)
; SI-NEXT: v_mov_b32_e32 v35, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v43, v8
; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v42, v9
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v60, v9
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v8
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v8
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
@@ -140581,7 +140617,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v44, v14
+; SI-NEXT: v_mov_b32_e32 v33, v14
; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -140604,7 +140640,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: .LBB79_2: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -140620,7 +140656,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57
@@ -140632,7 +140668,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36
@@ -140744,7 +140780,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -140767,7 +140803,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58
@@ -140782,7 +140818,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
@@ -140868,16 +140904,16 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB79_4:
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v61, v53
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
@@ -140886,7 +140922,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: v_mov_b32_e32 v57, v11
; SI-NEXT: v_mov_b32_e32 v47, v10
; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_mov_b32_e32 v33, v14
+; SI-NEXT: v_mov_b32_e32 v44, v14
; SI-NEXT: v_mov_b32_e32 v62, v38
; SI-NEXT: v_mov_b32_e32 v38, v39
; SI-NEXT: v_mov_b32_e32 v39, v41
@@ -144853,70 +144889,68 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: s_lshr_b32 s46, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v10, s46
; SI-NEXT: s_lshr_b32 s46, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v55, s46
+; SI-NEXT: v_cvt_f32_f16_e32 v41, s46
; SI-NEXT: s_lshr_b32 s46, s7, 16
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_cvt_f32_f16_e32 v59, s46
-; SI-NEXT: s_lshr_b32 s46, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v8, s46
-; SI-NEXT: s_lshr_b32 s46, s9, 16
+; SI-NEXT: s_lshr_b32 s46, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v9, s46
-; SI-NEXT: s_lshr_b32 s46, s8, 16
+; SI-NEXT: s_lshr_b32 s46, s9, 16
; SI-NEXT: v_cvt_f32_f16_e32 v16, s46
-; SI-NEXT: s_lshr_b32 s46, s11, 16
+; SI-NEXT: s_lshr_b32 s46, s8, 16
; SI-NEXT: v_cvt_f32_f16_e32 v23, s46
-; SI-NEXT: s_lshr_b32 s46, s10, 16
+; SI-NEXT: s_lshr_b32 s46, s11, 16
; SI-NEXT: v_cvt_f32_f16_e32 v27, s46
-; SI-NEXT: s_lshr_b32 s46, s13, 16
+; SI-NEXT: s_lshr_b32 s46, s10, 16
; SI-NEXT: v_cvt_f32_f16_e32 v31, s46
-; SI-NEXT: s_lshr_b32 s46, s12, 16
+; SI-NEXT: s_lshr_b32 s46, s13, 16
; SI-NEXT: v_cvt_f32_f16_e32 v38, s46
-; SI-NEXT: s_lshr_b32 s46, s15, 16
+; SI-NEXT: s_lshr_b32 s46, s12, 16
; SI-NEXT: v_cvt_f32_f16_e32 v50, s46
-; SI-NEXT: s_lshr_b32 s46, s14, 16
+; SI-NEXT: s_lshr_b32 s46, s15, 16
; SI-NEXT: v_cvt_f32_f16_e32 v54, s46
-; SI-NEXT: s_lshr_b32 s46, s41, 16
+; SI-NEXT: s_lshr_b32 s46, s14, 16
; SI-NEXT: v_cvt_f32_f16_e32 v40, s46
-; SI-NEXT: s_lshr_b32 s46, s40, 16
+; SI-NEXT: s_lshr_b32 s46, s41, 16
; SI-NEXT: v_cvt_f32_f16_e32 v42, s46
+; SI-NEXT: s_lshr_b32 s46, s40, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v56, s46
; SI-NEXT: s_lshr_b32 s46, s43, 16
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v61, s46
-; SI-NEXT: s_lshr_b32 s46, s42, 16
; SI-NEXT: v_cvt_f32_f16_e32 v12, s46
+; SI-NEXT: s_lshr_b32 s46, s42, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v35, s46
; SI-NEXT: s_lshr_b32 s46, s45, 16
; SI-NEXT: v_cvt_f32_f16_e32 v32, s46
; SI-NEXT: s_lshr_b32 s46, s44, 16
; SI-NEXT: v_cvt_f32_f16_e32 v36, s46
; SI-NEXT: s_lshr_b32 s46, s29, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v28, s46
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s46
; SI-NEXT: s_lshr_b32 s46, s28, 16
; SI-NEXT: v_cvt_f32_f16_e32 v48, s46
; SI-NEXT: s_lshr_b32 s46, s27, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s42
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
; SI-NEXT: v_cvt_f32_f16_e32 v24, s46
; SI-NEXT: s_lshr_b32 s46, s26, 16
; SI-NEXT: v_cvt_f32_f16_e32 v52, s46
; SI-NEXT: s_lshr_b32 s46, s25, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v13, s46
+; SI-NEXT: v_cvt_f32_f16_e32 v28, s46
; SI-NEXT: s_lshr_b32 s46, s24, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v20, s46
+; SI-NEXT: v_cvt_f32_f16_e32 v17, s46
; SI-NEXT: s_lshr_b32 s46, s23, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s45
-; SI-NEXT: v_cvt_f32_f16_e32 v17, s46
+; SI-NEXT: v_cvt_f32_f16_e32 v20, s46
; SI-NEXT: s_lshr_b32 s46, s22, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v35, s46
+; SI-NEXT: v_cvt_f32_f16_e32 v13, s46
; SI-NEXT: s_lshr_b32 s46, s21, 16
; SI-NEXT: v_cvt_f32_f16_e32 v46, s46
; SI-NEXT: s_lshr_b32 s46, s20, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s46
+; SI-NEXT: v_cvt_f32_f16_e32 v45, s46
; SI-NEXT: s_lshr_b32 s46, s19, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s44
-; SI-NEXT: v_cvt_f32_f16_e32 v57, s46
+; SI-NEXT: v_cvt_f32_f16_e32 v14, s46
; SI-NEXT: s_lshr_b32 s46, s18, 16
; SI-NEXT: v_cvt_f32_f16_e32 v58, s46
; SI-NEXT: s_lshr_b32 s46, s17, 16
@@ -144924,8 +144958,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: s_lshr_b32 s46, s16, 16
; SI-NEXT: v_cvt_f32_f16_e32 v62, s46
; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v56, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
; SI-NEXT: v_cvt_f32_f16_e32 v15, s6
; SI-NEXT: v_cvt_f32_f16_e32 v6, s9
; SI-NEXT: v_cvt_f32_f16_e32 v18, s8
@@ -144933,11 +144966,12 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v19, s10
; SI-NEXT: v_cvt_f32_f16_e32 v39, s13
; SI-NEXT: v_cvt_f32_f16_e32 v51, s12
-; SI-NEXT: v_cvt_f32_f16_e32 v41, s15
+; SI-NEXT: v_cvt_f32_f16_e32 v55, s15
; SI-NEXT: v_cvt_f32_f16_e32 v43, s14
-; SI-NEXT: v_cvt_f32_f16_e32 v45, s41
-; SI-NEXT: v_cvt_f32_f16_e32 v44, s40
-; SI-NEXT: v_cvt_f32_f16_e32 v63, s43
+; SI-NEXT: v_cvt_f32_f16_e32 v44, s41
+; SI-NEXT: v_cvt_f32_f16_e32 v57, s40
+; SI-NEXT: v_cvt_f32_f16_e32 v59, s43
+; SI-NEXT: v_cvt_f32_f16_e32 v61, s42
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f32_f16_e32 v37, s29
; SI-NEXT: v_cvt_f32_f16_e32 v22, s28
@@ -144951,7 +144985,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v47, s20
; SI-NEXT: v_cvt_f32_f16_e32 v34, s19
; SI-NEXT: v_cvt_f32_f16_e32 v25, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s17
+; SI-NEXT: v_cvt_f32_f16_e32 v63, s17
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s16
; SI-NEXT: s_cbranch_execnz .LBB81_3
@@ -144976,143 +145010,146 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49
+; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36
-; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0
-; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v28
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: v_add_f64 v[29:30], s[42:43], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v29
+; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v61, v29
; SI-NEXT: v_cvt_f32_f16_e32 v29, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v24
-; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v24
; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v26
+; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v26
; SI-NEXT: v_cvt_f32_f16_e32 v26, v52
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v17
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v33
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v7
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v34
; SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0
-; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v14
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v21
; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0
; SI-NEXT: v_cvt_f32_f16_e32 v43, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v63, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v30
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
; SI-NEXT: v_cvt_f32_f16_e32 v30, v2
; SI-NEXT: v_cvt_f32_f16_e32 v21, v1
; SI-NEXT: v_add_f64 v[1:2], s[18:19], 1.0
-; SI-NEXT: v_cvt_f32_f16_e32 v61, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v62, v62
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v7
+; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f32_f16_e32 v51, v18
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v25
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v25
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v34, v2
; SI-NEXT: v_cvt_f32_f16_e32 v25, v1
; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v20
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v58
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v7
-; SI-NEXT: v_mov_b32_e32 v7, v61
-; SI-NEXT: v_mov_b32_e32 v61, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v62, v19
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v57
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v28
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v17
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v62, v62
; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14
; SI-NEXT: v_cvt_f32_f16_e32 v6, v11
; SI-NEXT: v_cvt_f32_f16_e32 v11, v15
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v19
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v15
+; SI-NEXT: v_mov_b32_e32 v15, v45
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v18
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v18, v12
+; SI-NEXT: v_mov_b32_e32 v12, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v62, v19
; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v47
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v32
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v58
; SI-NEXT: v_cvt_f32_f16_e32 v32, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v15
-; SI-NEXT: v_mov_b32_e32 v15, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v14
-; SI-NEXT: v_mov_b32_e32 v14, v12
-; SI-NEXT: v_mov_b32_e32 v12, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v7
; SI-NEXT: v_cvt_f32_f16_e32 v60, v4
-; SI-NEXT: v_mov_b32_e32 v18, v3
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v7, v57
+; SI-NEXT: v_mov_b32_e32 v57, v3
; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v24, v13
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v52, v13
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: .LBB81_3: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v3, v62
@@ -145123,7 +145160,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v63
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0
@@ -145136,13 +145173,13 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v14
; SI-NEXT: v_cvt_f16_f32_e32 v3, v34
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v45
; SI-NEXT: v_cvt_f16_f32_e32 v3, v47
; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -145156,28 +145193,28 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v20
; SI-NEXT: v_cvt_f16_f32_e32 v3, v30
; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v17
; SI-NEXT: v_cvt_f16_f32_e32 v3, v26
; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v28
; SI-NEXT: v_cvt_f16_f32_e32 v3, v53
; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -145205,7 +145242,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v3, v37
; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -145227,111 +145264,111 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v6
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v61
; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v6
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v63
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v59
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v57
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v45
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v44
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
; SI-NEXT: v_cvt_f16_f32_e32 v3, v43
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v55
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
; SI-NEXT: v_cvt_f16_f32_e32 v3, v51
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
; SI-NEXT: v_cvt_f16_f32_e32 v3, v39
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
; SI-NEXT: v_cvt_f16_f32_e32 v3, v19
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v27
; SI-NEXT: v_cvt_f16_f32_e32 v3, v11
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v23
; SI-NEXT: v_cvt_f16_f32_e32 v3, v18
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v16
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v59
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v8
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -145367,24 +145404,24 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr49
@@ -145392,39 +145429,39 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr3
@@ -149698,24 +149735,24 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21
@@ -149736,23 +149773,23 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
@@ -149765,45 +149802,46 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB87_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v7, v0, v61
+; SI-NEXT: v_or_b32_e32 v7, v0, v58
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4
-; SI-NEXT: v_or_b32_e32 v9, v0, v50
+; SI-NEXT: v_or_b32_e32 v9, v0, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6
-; SI-NEXT: v_or_b32_e32 v10, v0, v43
+; SI-NEXT: v_or_b32_e32 v10, v0, v50
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57
-; SI-NEXT: v_or_b32_e32 v11, v0, v41
+; SI-NEXT: v_or_b32_e32 v11, v0, v45
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56
-; SI-NEXT: v_or_b32_e32 v12, v0, v40
+; SI-NEXT: v_or_b32_e32 v12, v0, v41
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49
-; SI-NEXT: v_mov_b32_e32 v52, v57
-; SI-NEXT: v_mov_b32_e32 v57, v40
-; SI-NEXT: v_mov_b32_e32 v40, v49
-; SI-NEXT: v_mov_b32_e32 v49, v13
+; SI-NEXT: v_mov_b32_e32 v36, v41
+; SI-NEXT: v_mov_b32_e32 v41, v13
; SI-NEXT: v_or_b32_e32 v13, v0, v13
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
-; SI-NEXT: v_mov_b32_e32 v36, v41
-; SI-NEXT: v_mov_b32_e32 v41, v14
-; SI-NEXT: v_or_b32_e32 v14, v0, v48
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16
; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v48
-; SI-NEXT: v_mov_b32_e32 v48, v15
+; SI-NEXT: v_mov_b32_e32 v50, v45
+; SI-NEXT: v_mov_b32_e32 v45, v14
+; SI-NEXT: v_or_b32_e32 v14, v0, v40
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16
+; SI-NEXT: v_mov_b32_e32 v52, v57
+; SI-NEXT: v_mov_b32_e32 v57, v49
+; SI-NEXT: v_mov_b32_e32 v49, v40
+; SI-NEXT: v_mov_b32_e32 v40, v15
; SI-NEXT: v_or_b32_e32 v15, v0, v15
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18
-; SI-NEXT: v_mov_b32_e32 v38, v61
+; SI-NEXT: v_mov_b32_e32 v34, v58
+; SI-NEXT: v_mov_b32_e32 v58, v61
; SI-NEXT: v_mov_b32_e32 v61, v56
; SI-NEXT: v_mov_b32_e32 v56, v16
; SI-NEXT: v_or_b32_e32 v16, v0, v37
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_or_b32_e32 v17, v0, v17
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22
; SI-NEXT: s_waitcnt expcnt(0)
@@ -149837,7 +149875,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_and_b32 s6, s20, 0xffff
; SI-NEXT: s_lshl_b32 s7, s21, 16
; SI-NEXT: v_or_b32_e32 v26, v0, v26
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59
; SI-NEXT: s_or_b32 s6, s6, s7
; SI-NEXT: s_and_b32 s7, s22, 0xffff
; SI-NEXT: s_lshl_b32 s8, s23, 16
@@ -149848,7 +149886,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_lshl_b32 s9, s25, 16
; SI-NEXT: v_mov_b32_e32 v33, v28
; SI-NEXT: v_or_b32_e32 v28, v0, v5
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43
; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_and_b32 s9, s26, 0xffff
; SI-NEXT: s_lshl_b32 s10, s27, 16
@@ -149860,7 +149898,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_lshl_b32 s11, s29, 16
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v30, v0, v3
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38
; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_mov_b32_e32 v32, v55
@@ -149868,9 +149906,9 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_mov_b32_e32 v53, v6
; SI-NEXT: v_mov_b32_e32 v47, v46
-; SI-NEXT: v_mov_b32_e32 v45, v44
-; SI-NEXT: v_mov_b32_e32 v59, v42
-; SI-NEXT: v_or_b32_e32 v31, v0, v34
+; SI-NEXT: v_mov_b32_e32 v59, v44
+; SI-NEXT: v_mov_b32_e32 v43, v42
+; SI-NEXT: v_or_b32_e32 v31, v0, v48
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
@@ -149880,12 +149918,13 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_mov_b32_e32 v6, s10
; SI-NEXT: s_cbranch_execnz .LBB87_3
; SI-NEXT: .LBB87_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v32, v1
; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v38, v43
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: s_and_b32 s4, s16, 0xffff
; SI-NEXT: s_lshl_b32 s5, s17, 16
@@ -149929,42 +149968,42 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v38, v0
+; SI-NEXT: v_or_b32_e32 v0, v34, v0
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v51, v0
+; SI-NEXT: v_or_b32_e32 v0, v58, v0
; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v50, v0
+; SI-NEXT: v_or_b32_e32 v0, v51, v0
; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v36, v0
+; SI-NEXT: v_or_b32_e32 v0, v50, v0
; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v57, v0
+; SI-NEXT: v_or_b32_e32 v0, v36, v0
; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v49, v0
+; SI-NEXT: v_or_b32_e32 v0, v41, v0
; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v43, v0
+; SI-NEXT: v_or_b32_e32 v0, v49, v0
; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v48, v0
+; SI-NEXT: v_or_b32_e32 v0, v40, v0
; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -150026,7 +150065,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -150042,12 +150081,12 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
@@ -150056,7 +150095,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -150091,26 +150130,26 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB87_4:
-; SI-NEXT: v_mov_b32_e32 v38, v61
+; SI-NEXT: v_mov_b32_e32 v34, v58
; SI-NEXT: v_mov_b32_e32 v32, v55
+; SI-NEXT: v_mov_b32_e32 v58, v61
; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_mov_b32_e32 v53, v6
; SI-NEXT: v_mov_b32_e32 v52, v57
; SI-NEXT: v_mov_b32_e32 v51, v50
; SI-NEXT: v_mov_b32_e32 v61, v56
-; SI-NEXT: v_mov_b32_e32 v50, v43
+; SI-NEXT: v_mov_b32_e32 v50, v45
; SI-NEXT: v_mov_b32_e32 v36, v41
-; SI-NEXT: v_mov_b32_e32 v57, v40
-; SI-NEXT: v_mov_b32_e32 v40, v49
-; SI-NEXT: v_mov_b32_e32 v49, v13
-; SI-NEXT: v_mov_b32_e32 v43, v48
-; SI-NEXT: v_mov_b32_e32 v48, v15
-; SI-NEXT: v_mov_b32_e32 v41, v14
+; SI-NEXT: v_mov_b32_e32 v41, v13
+; SI-NEXT: v_mov_b32_e32 v57, v49
+; SI-NEXT: v_mov_b32_e32 v49, v40
+; SI-NEXT: v_mov_b32_e32 v40, v15
+; SI-NEXT: v_mov_b32_e32 v45, v14
; SI-NEXT: v_mov_b32_e32 v56, v16
; SI-NEXT: v_mov_b32_e32 v47, v46
-; SI-NEXT: v_mov_b32_e32 v45, v44
-; SI-NEXT: v_mov_b32_e32 v59, v42
+; SI-NEXT: v_mov_b32_e32 v59, v44
+; SI-NEXT: v_mov_b32_e32 v43, v42
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v37, v20
; SI-NEXT: v_mov_b32_e32 v39, v23
@@ -157017,6 +157056,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
@@ -157027,15 +157067,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
-; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT: s_mov_b32 s72, s21
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v43, s19, 0
-; SI-NEXT: v_writelane_b32 v43, s18, 1
-; SI-NEXT: v_writelane_b32 v43, s17, 2
-; SI-NEXT: v_writelane_b32 v43, s16, 3
-; SI-NEXT: s_mov_b32 s60, s24
+; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_writelane_b32 v41, s30, 0
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_writelane_b32 v44, s29, 0
+; SI-NEXT: v_writelane_b32 v44, s28, 1
+; SI-NEXT: v_writelane_b32 v44, s27, 2
+; SI-NEXT: v_writelane_b32 v44, s26, 3
+; SI-NEXT: v_writelane_b32 v44, s19, 4
+; SI-NEXT: v_writelane_b32 v44, s18, 5
+; SI-NEXT: v_writelane_b32 v44, s17, 6
+; SI-NEXT: v_writelane_b32 v44, s16, 7
; SI-NEXT: v_writelane_b32 v41, s31, 1
; SI-NEXT: v_writelane_b32 v41, s34, 2
; SI-NEXT: v_writelane_b32 v41, s35, 3
@@ -157059,8 +157102,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s69, 21
; SI-NEXT: v_writelane_b32 v41, s70, 22
; SI-NEXT: v_writelane_b32 v41, s71, 23
-; SI-NEXT: s_mov_b32 s77, s28
-; SI-NEXT: s_mov_b32 s76, s27
; SI-NEXT: v_writelane_b32 v41, s80, 24
; SI-NEXT: v_writelane_b32 v41, s81, 25
; SI-NEXT: v_writelane_b32 v41, s82, 26
@@ -157071,100 +157112,92 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s87, 31
; SI-NEXT: v_writelane_b32 v41, s96, 32
; SI-NEXT: v_writelane_b32 v41, s97, 33
+; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; SI-NEXT: v_readfirstlane_b32 s30, v16
+; SI-NEXT: v_readfirstlane_b32 s31, v15
+; SI-NEXT: v_readfirstlane_b32 s34, v21
+; SI-NEXT: v_readfirstlane_b32 s35, v22
+; SI-NEXT: v_readfirstlane_b32 s36, v20
+; SI-NEXT: v_readfirstlane_b32 s37, v19
+; SI-NEXT: v_readfirstlane_b32 s38, v25
+; SI-NEXT: v_readfirstlane_b32 s39, v26
+; SI-NEXT: v_readfirstlane_b32 s48, v24
+; SI-NEXT: v_readfirstlane_b32 s49, v23
+; SI-NEXT: v_readfirstlane_b32 s50, v29
+; SI-NEXT: v_readfirstlane_b32 s51, v30
+; SI-NEXT: v_readfirstlane_b32 s52, v28
+; SI-NEXT: v_readfirstlane_b32 s53, v27
; SI-NEXT: v_writelane_b32 v41, s98, 34
; SI-NEXT: v_writelane_b32 v41, s99, 35
-; SI-NEXT: s_mov_b32 s79, s26
-; SI-NEXT: v_readfirstlane_b32 s38, v20
-; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: v_readfirstlane_b32 s39, v19
-; SI-NEXT: v_writelane_b32 v42, s38, 0
-; SI-NEXT: v_readfirstlane_b32 s48, v25
-; SI-NEXT: v_writelane_b32 v42, s39, 1
-; SI-NEXT: v_readfirstlane_b32 s49, v26
-; SI-NEXT: v_writelane_b32 v42, s48, 2
-; SI-NEXT: v_readfirstlane_b32 s50, v24
-; SI-NEXT: v_writelane_b32 v42, s49, 3
-; SI-NEXT: v_readfirstlane_b32 s51, v23
-; SI-NEXT: v_writelane_b32 v42, s50, 4
-; SI-NEXT: v_readfirstlane_b32 s52, v29
-; SI-NEXT: v_writelane_b32 v42, s51, 5
-; SI-NEXT: v_readfirstlane_b32 s53, v30
-; SI-NEXT: v_writelane_b32 v42, s52, 6
-; SI-NEXT: v_readfirstlane_b32 s54, v28
-; SI-NEXT: v_writelane_b32 v42, s53, 7
-; SI-NEXT: v_readfirstlane_b32 s55, v27
-; SI-NEXT: v_writelane_b32 v42, s54, 8
-; SI-NEXT: v_writelane_b32 v42, s55, 9
+; SI-NEXT: s_mov_b32 s6, s21
; SI-NEXT: v_readfirstlane_b32 s16, v1
; SI-NEXT: v_readfirstlane_b32 s17, v2
; SI-NEXT: v_readfirstlane_b32 s18, v5
; SI-NEXT: v_readfirstlane_b32 s19, v6
-; SI-NEXT: v_readfirstlane_b32 s88, v4
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_readfirstlane_b32 s90, v9
+; SI-NEXT: v_readfirstlane_b32 s78, v4
+; SI-NEXT: v_readfirstlane_b32 s79, v3
+; SI-NEXT: v_readfirstlane_b32 s88, v9
+; SI-NEXT: v_readfirstlane_b32 s89, v10
+; SI-NEXT: v_readfirstlane_b32 s90, v8
+; SI-NEXT: v_readfirstlane_b32 s91, v7
+; SI-NEXT: v_readfirstlane_b32 s92, v13
+; SI-NEXT: v_readfirstlane_b32 s93, v14
+; SI-NEXT: v_readfirstlane_b32 s94, v12
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s6, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280
+; SI-NEXT: v_writelane_b32 v44, s4, 8
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v32
-; SI-NEXT: v_writelane_b32 v43, s4, 4
+; SI-NEXT: v_writelane_b32 v44, s4, 9
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272
-; SI-NEXT: v_writelane_b32 v43, s4, 5
+; SI-NEXT: v_writelane_b32 v44, s4, 10
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v43, s4, 6
+; SI-NEXT: v_writelane_b32 v44, s4, 11
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
-; SI-NEXT: v_writelane_b32 v43, s4, 7
+; SI-NEXT: v_writelane_b32 v44, s4, 12
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v43, s4, 8
+; SI-NEXT: v_writelane_b32 v44, s4, 13
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
+; SI-NEXT: v_writelane_b32 v44, s4, 14
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT: v_writelane_b32 v43, s4, 9
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v43, s4, 10
-; SI-NEXT: v_readfirstlane_b32 s91, v10
-; SI-NEXT: v_readfirstlane_b32 s92, v8
-; SI-NEXT: v_readfirstlane_b32 s93, v7
-; SI-NEXT: v_readfirstlane_b32 s94, v13
-; SI-NEXT: v_readfirstlane_b32 s95, v14
-; SI-NEXT: v_readfirstlane_b32 s30, v17
-; SI-NEXT: v_readfirstlane_b32 s31, v18
-; SI-NEXT: v_readfirstlane_b32 s34, v16
-; SI-NEXT: v_readfirstlane_b32 s35, v15
-; SI-NEXT: v_readfirstlane_b32 s36, v21
-; SI-NEXT: v_readfirstlane_b32 s37, v22
+; SI-NEXT: v_writelane_b32 v44, s4, 15
+; SI-NEXT: v_readfirstlane_b32 s95, v11
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v43, s4, 11
+; SI-NEXT: v_writelane_b32 v44, s4, 16
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v43, s4, 12
+; SI-NEXT: v_writelane_b32 v44, s4, 17
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v43, s4, 13
+; SI-NEXT: v_writelane_b32 v44, s4, 18
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v43, s4, 14
+; SI-NEXT: v_writelane_b32 v44, s4, 19
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v43, s4, 15
+; SI-NEXT: v_writelane_b32 v44, s4, 20
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v51
+; SI-NEXT: v_writelane_b32 v44, s4, 21
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244
@@ -157173,39 +157206,49 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s75, v32
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: v_writelane_b32 v44, s4, 22
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s61, v33
+; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v43, s4, 16
+; SI-NEXT: v_writelane_b32 v44, s4, 23
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s43, v34
+; SI-NEXT: v_readfirstlane_b32 s4, v34
+; SI-NEXT: v_writelane_b32 v44, s4, 24
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s40, v35
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: v_writelane_b32 v44, s4, 25
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
+; SI-NEXT: v_writelane_b32 v44, s4, 26
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s63, v37
+; SI-NEXT: v_readfirstlane_b32 s4, v37
+; SI-NEXT: v_writelane_b32 v44, s4, 27
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204
-; SI-NEXT: v_writelane_b32 v43, s4, 17
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s59, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_writelane_b32 v44, s4, 28
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s42, v38
+; SI-NEXT: v_readfirstlane_b32 s4, v38
+; SI-NEXT: v_writelane_b32 v44, s4, 29
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s73, v39
+; SI-NEXT: v_readfirstlane_b32 s4, v39
+; SI-NEXT: v_writelane_b32 v44, s4, 30
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s21, v48
+; SI-NEXT: v_readfirstlane_b32 s4, v48
+; SI-NEXT: v_writelane_b32 v44, s4, 31
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s57, v49
+; SI-NEXT: v_readfirstlane_b32 s4, v49
+; SI-NEXT: v_writelane_b32 v44, s4, 32
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s13, v50
+; SI-NEXT: v_readfirstlane_b32 s4, v50
+; SI-NEXT: v_writelane_b32 v44, s4, 33
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s45, v51
+; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -157213,51 +157256,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s47, v32
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s24, v33
+; SI-NEXT: v_readfirstlane_b32 s58, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168
+; SI-NEXT: v_readfirstlane_b32 s26, v32
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_readfirstlane_b32 s77, v34
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_readfirstlane_b32 s63, v35
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_readfirstlane_b32 s57, v36
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_readfirstlane_b32 s56, v37
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s78, v34
-; SI-NEXT: v_readfirstlane_b32 s4, v35
-; SI-NEXT: v_writelane_b32 v43, s4, 18
-; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v43, s4, 19
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_readfirstlane_b32 s4, v37
-; SI-NEXT: v_writelane_b32 v43, s4, 20
+; SI-NEXT: v_writelane_b32 v44, s4, 34
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v43, s4, 21
+; SI-NEXT: v_readfirstlane_b32 s61, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v43, s4, 22
+; SI-NEXT: v_readfirstlane_b32 s74, v38
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v43, s4, 23
+; SI-NEXT: v_readfirstlane_b32 s76, v39
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v43, s4, 24
+; SI-NEXT: v_readfirstlane_b32 s47, v48
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v43, s4, 25
+; SI-NEXT: v_readfirstlane_b32 s45, v49
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v43, s4, 26
+; SI-NEXT: v_readfirstlane_b32 s60, v50
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s4, v51
-; SI-NEXT: v_writelane_b32 v43, s4, 27
+; SI-NEXT: v_readfirstlane_b32 s42, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s4, v33
+; SI-NEXT: v_readfirstlane_b32 s13, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132
@@ -157269,43 +157304,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
-; SI-NEXT: v_writelane_b32 v43, s4, 28
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s4, v52
-; SI-NEXT: v_writelane_b32 v43, s4, 29
-; SI-NEXT: v_readfirstlane_b32 s4, v53
-; SI-NEXT: v_writelane_b32 v43, s4, 30
-; SI-NEXT: v_readfirstlane_b32 s4, v54
-; SI-NEXT: v_writelane_b32 v43, s4, 31
-; SI-NEXT: v_readfirstlane_b32 s4, v55
-; SI-NEXT: v_writelane_b32 v43, s4, 32
+; SI-NEXT: v_readfirstlane_b32 s72, v52
+; SI-NEXT: v_readfirstlane_b32 s73, v53
+; SI-NEXT: v_readfirstlane_b32 s44, v55
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_readfirstlane_b32 s4, v40
-; SI-NEXT: v_writelane_b32 v43, s4, 33
-; SI-NEXT: v_writelane_b32 v43, s22, 34
-; SI-NEXT: v_writelane_b32 v43, s23, 35
-; SI-NEXT: v_writelane_b32 v43, s72, 36
-; SI-NEXT: v_writelane_b32 v43, s20, 37
-; SI-NEXT: v_writelane_b32 v43, s79, 38
-; SI-NEXT: v_writelane_b32 v43, s76, 39
-; SI-NEXT: v_writelane_b32 v43, s25, 40
-; SI-NEXT: v_writelane_b32 v43, s60, 41
-; SI-NEXT: v_writelane_b32 v43, s29, 42
-; SI-NEXT: v_writelane_b32 v43, s77, 43
-; SI-NEXT: v_writelane_b32 v43, s16, 44
-; SI-NEXT: v_writelane_b32 v43, s17, 45
-; SI-NEXT: v_writelane_b32 v43, s18, 46
-; SI-NEXT: v_writelane_b32 v43, s19, 47
-; SI-NEXT: v_writelane_b32 v43, s88, 48
-; SI-NEXT: v_writelane_b32 v43, s89, 49
-; SI-NEXT: v_writelane_b32 v43, s90, 50
-; SI-NEXT: v_writelane_b32 v43, s91, 51
-; SI-NEXT: v_writelane_b32 v43, s92, 52
-; SI-NEXT: v_writelane_b32 v43, s93, 53
-; SI-NEXT: v_writelane_b32 v43, s94, 54
-; SI-NEXT: v_writelane_b32 v43, s95, 55
+; SI-NEXT: v_writelane_b32 v44, s4, 35
+; SI-NEXT: v_writelane_b32 v44, s22, 36
+; SI-NEXT: v_writelane_b32 v44, s23, 37
+; SI-NEXT: v_writelane_b32 v44, s6, 38
+; SI-NEXT: v_writelane_b32 v44, s20, 39
+; SI-NEXT: v_writelane_b32 v44, s25, 40
+; SI-NEXT: v_writelane_b32 v44, s24, 41
+; SI-NEXT: v_writelane_b32 v44, s44, 42
+; SI-NEXT: v_writelane_b32 v44, s72, 43
+; SI-NEXT: v_writelane_b32 v44, s13, 44
+; SI-NEXT: v_writelane_b32 v44, s60, 45
+; SI-NEXT: v_writelane_b32 v44, s73, 46
+; SI-NEXT: v_readfirstlane_b32 s21, v54
+; SI-NEXT: v_writelane_b32 v44, s42, 47
+; SI-NEXT: v_writelane_b32 v44, s21, 48
+; SI-NEXT: v_writelane_b32 v44, s16, 49
+; SI-NEXT: v_writelane_b32 v44, s17, 50
+; SI-NEXT: v_writelane_b32 v44, s18, 51
+; SI-NEXT: v_writelane_b32 v44, s19, 52
+; SI-NEXT: v_writelane_b32 v44, s78, 53
+; SI-NEXT: v_writelane_b32 v44, s79, 54
+; SI-NEXT: v_writelane_b32 v44, s88, 55
+; SI-NEXT: v_writelane_b32 v44, s89, 56
+; SI-NEXT: v_writelane_b32 v44, s90, 57
+; SI-NEXT: v_writelane_b32 v44, s91, 58
+; SI-NEXT: v_writelane_b32 v44, s92, 59
+; SI-NEXT: v_writelane_b32 v44, s93, 60
+; SI-NEXT: v_writelane_b32 v44, s94, 61
+; SI-NEXT: v_writelane_b32 v44, s95, 62
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s62, v33
+; SI-NEXT: v_readfirstlane_b32 s40, v33
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s10, v34
; SI-NEXT: s_waitcnt vmcnt(8)
@@ -157313,13 +157348,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s28, v31
; SI-NEXT: v_readfirstlane_b32 s27, v32
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s58, v36
+; SI-NEXT: v_readfirstlane_b32 s29, v36
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s69, v37
+; SI-NEXT: v_readfirstlane_b32 s70, v37
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s14, v38
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s68, v39
+; SI-NEXT: v_readfirstlane_b32 s69, v39
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
@@ -157334,42 +157369,50 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s11, v49
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s70, v50
+; SI-NEXT: v_readfirstlane_b32 s71, v50
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s71, v51
+; SI-NEXT: v_readfirstlane_b32 s81, v51
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12
-; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11
-; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56
-; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57
-; SI-NEXT: v_writelane_b32 v43, s30, 58
-; SI-NEXT: v_writelane_b32 v43, s31, 59
-; SI-NEXT: v_writelane_b32 v43, s34, 60
-; SI-NEXT: v_writelane_b32 v43, s35, 61
-; SI-NEXT: v_writelane_b32 v43, s36, 62
-; SI-NEXT: v_writelane_b32 v43, s37, 63
+; SI-NEXT: v_readfirstlane_b32 vcc_hi, v18
+; SI-NEXT: v_writelane_b32 v43, vcc_hi, 0
+; SI-NEXT: v_writelane_b32 v43, s30, 1
+; SI-NEXT: v_writelane_b32 v43, s31, 2
+; SI-NEXT: v_writelane_b32 v43, s34, 3
+; SI-NEXT: v_writelane_b32 v43, s35, 4
+; SI-NEXT: v_writelane_b32 v43, s36, 5
+; SI-NEXT: v_writelane_b32 v43, s37, 6
+; SI-NEXT: v_writelane_b32 v43, s38, 7
+; SI-NEXT: v_writelane_b32 v43, s39, 8
+; SI-NEXT: v_writelane_b32 v43, s48, 9
+; SI-NEXT: v_writelane_b32 v43, s49, 10
+; SI-NEXT: v_writelane_b32 v43, s50, 11
+; SI-NEXT: v_writelane_b32 v43, s51, 12
+; SI-NEXT: v_writelane_b32 v43, s52, 13
+; SI-NEXT: v_writelane_b32 v43, s53, 14
+; SI-NEXT: v_readfirstlane_b32 vcc_lo, v17
+; SI-NEXT: v_writelane_b32 v44, vcc_lo, 63
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s74, v31
+; SI-NEXT: v_readfirstlane_b32 s46, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s46, v32
+; SI-NEXT: v_readfirstlane_b32 s59, v32
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s96, v33
+; SI-NEXT: v_readfirstlane_b32 s83, v33
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s98, v34
+; SI-NEXT: v_readfirstlane_b32 s12, v34
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s41, v35
+; SI-NEXT: v_readfirstlane_b32 s97, v35
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s56, v36
+; SI-NEXT: v_readfirstlane_b32 s8, v36
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s87, v37
+; SI-NEXT: v_readfirstlane_b32 s84, v37
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s99, v38
+; SI-NEXT: v_readfirstlane_b32 s86, v38
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s81, v39
+; SI-NEXT: v_readfirstlane_b32 s15, v39
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
@@ -157379,415 +157422,417 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s26, v48
+; SI-NEXT: v_readfirstlane_b32 s62, v48
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s83, v49
+; SI-NEXT: v_readfirstlane_b32 s96, v49
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s82, v50
+; SI-NEXT: v_readfirstlane_b32 s7, v50
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s7, v51
+; SI-NEXT: v_readfirstlane_b32 s80, v51
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s15, v31
+; SI-NEXT: v_readfirstlane_b32 s41, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s97, v32
+; SI-NEXT: v_readfirstlane_b32 s98, v32
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s44, v33
+; SI-NEXT: v_readfirstlane_b32 s99, v33
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s9, v34
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s80, v35
+; SI-NEXT: v_readfirstlane_b32 s82, v35
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s86, v36
+; SI-NEXT: v_readfirstlane_b32 s68, v36
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s85, v37
+; SI-NEXT: v_readfirstlane_b32 s67, v37
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s8, v38
+; SI-NEXT: v_readfirstlane_b32 s85, v38
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s12, v39
+; SI-NEXT: v_readfirstlane_b32 s87, v39
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_readfirstlane_b32 s65, v48
+; SI-NEXT: v_readfirstlane_b32 s55, v48
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_readfirstlane_b32 s64, v49
-; SI-NEXT: v_writelane_b32 v42, s64, 10
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_readfirstlane_b32 s67, v50
-; SI-NEXT: v_writelane_b32 v42, s65, 11
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s84, v51
-; SI-NEXT: v_writelane_b32 v42, s67, 12
-; SI-NEXT: v_writelane_b32 v42, s84, 13
-; SI-NEXT: v_writelane_b32 v42, s85, 14
-; SI-NEXT: v_writelane_b32 v42, s86, 15
-; SI-NEXT: v_writelane_b32 v42, s87, 16
-; SI-NEXT: v_writelane_b32 v42, s8, 17
-; SI-NEXT: v_writelane_b32 v42, s99, 18
-; SI-NEXT: v_writelane_b32 v42, s12, 19
-; SI-NEXT: v_writelane_b32 v42, s44, 20
-; SI-NEXT: v_writelane_b32 v42, s97, 21
-; SI-NEXT: v_writelane_b32 v42, s83, 22
-; SI-NEXT: v_writelane_b32 v42, s82, 23
-; SI-NEXT: v_writelane_b32 v42, s98, 24
-; SI-NEXT: v_writelane_b32 v42, s96, 25
-; SI-NEXT: v_writelane_b32 v42, s81, 26
-; SI-NEXT: v_writelane_b32 v42, s9, 27
-; SI-NEXT: v_writelane_b32 v42, s41, 28
-; SI-NEXT: v_writelane_b32 v42, s80, 29
-; SI-NEXT: v_writelane_b32 v42, s7, 30
-; SI-NEXT: v_writelane_b32 v42, s56, 31
-; SI-NEXT: v_writelane_b32 v42, s26, 32
-; SI-NEXT: v_writelane_b32 v42, s15, 33
-; SI-NEXT: v_writelane_b32 v42, s14, 34
-; SI-NEXT: v_writelane_b32 v42, s69, 35
-; SI-NEXT: v_writelane_b32 v42, s71, 36
-; SI-NEXT: v_writelane_b32 v42, s70, 37
-; SI-NEXT: v_writelane_b32 v42, s68, 38
-; SI-NEXT: v_writelane_b32 v42, s74, 39
-; SI-NEXT: v_writelane_b32 v42, s46, 40
-; SI-NEXT: v_writelane_b32 v42, s11, 41
-; SI-NEXT: v_writelane_b32 v42, s10, 42
-; SI-NEXT: v_writelane_b32 v42, s62, 43
-; SI-NEXT: v_writelane_b32 v42, s66, 44
-; SI-NEXT: v_writelane_b32 v42, s58, 45
-; SI-NEXT: v_writelane_b32 v42, s28, 46
-; SI-NEXT: v_writelane_b32 v42, s27, 47
-; SI-NEXT: v_writelane_b32 v42, s78, 48
-; SI-NEXT: v_writelane_b32 v42, s24, 49
+; SI-NEXT: v_readfirstlane_b32 s54, v49
+; SI-NEXT: v_writelane_b32 v43, s54, 15
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s64, v50
+; SI-NEXT: v_writelane_b32 v43, s55, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_readfirstlane_b32 s65, v51
+; SI-NEXT: v_writelane_b32 v43, s64, 17
+; SI-NEXT: v_writelane_b32 v43, s65, 18
+; SI-NEXT: v_writelane_b32 v43, s67, 19
+; SI-NEXT: v_writelane_b32 v43, s68, 20
+; SI-NEXT: v_writelane_b32 v43, s84, 21
+; SI-NEXT: v_writelane_b32 v43, s85, 22
+; SI-NEXT: v_writelane_b32 v43, s86, 23
+; SI-NEXT: v_writelane_b32 v43, s87, 24
+; SI-NEXT: v_writelane_b32 v43, s99, 25
+; SI-NEXT: v_writelane_b32 v43, s98, 26
+; SI-NEXT: v_writelane_b32 v43, s96, 27
+; SI-NEXT: v_writelane_b32 v43, s7, 28
+; SI-NEXT: v_writelane_b32 v43, s12, 29
+; SI-NEXT: v_writelane_b32 v43, s83, 30
+; SI-NEXT: v_writelane_b32 v43, s15, 31
+; SI-NEXT: v_writelane_b32 v43, s9, 32
+; SI-NEXT: v_writelane_b32 v43, s97, 33
+; SI-NEXT: v_writelane_b32 v43, s82, 34
+; SI-NEXT: v_writelane_b32 v43, s80, 35
+; SI-NEXT: v_writelane_b32 v43, s8, 36
+; SI-NEXT: v_writelane_b32 v43, s62, 37
+; SI-NEXT: v_writelane_b32 v43, s41, 38
+; SI-NEXT: v_writelane_b32 v43, s14, 39
+; SI-NEXT: v_writelane_b32 v43, s70, 40
+; SI-NEXT: v_writelane_b32 v43, s81, 41
+; SI-NEXT: v_writelane_b32 v43, s71, 42
+; SI-NEXT: v_writelane_b32 v43, s69, 43
+; SI-NEXT: v_writelane_b32 v43, s46, 44
+; SI-NEXT: v_writelane_b32 v43, s59, 45
+; SI-NEXT: v_writelane_b32 v43, s11, 46
+; SI-NEXT: v_writelane_b32 v43, s10, 47
+; SI-NEXT: v_writelane_b32 v43, s40, 48
+; SI-NEXT: v_writelane_b32 v43, s66, 49
+; SI-NEXT: v_writelane_b32 v43, s29, 50
+; SI-NEXT: v_writelane_b32 v43, s28, 51
+; SI-NEXT: v_writelane_b32 v43, s27, 52
+; SI-NEXT: v_writelane_b32 v43, s45, 53
+; SI-NEXT: v_writelane_b32 v43, s47, 54
+; SI-NEXT: v_writelane_b32 v43, s61, 55
; SI-NEXT: s_cbranch_scc0 .LBB89_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_readlane_b32 s4, v43, 3
+; SI-NEXT: v_readlane_b32 s4, v44, 7
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: v_readlane_b32 s5, v43, 2
+; SI-NEXT: v_readlane_b32 s5, v44, 6
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_writelane_b32 v42, s4, 56
-; SI-NEXT: v_readlane_b32 s4, v43, 1
+; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT: v_readlane_b32 s5, v44, 4
+; SI-NEXT: v_writelane_b32 v42, s4, 0
+; SI-NEXT: v_readlane_b32 s4, v44, 5
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: v_readlane_b32 s5, v43, 0
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_writelane_b32 v42, s4, 57
+; SI-NEXT: v_writelane_b32 v42, s4, 1
; SI-NEXT: s_and_b32 s4, s20, 0xff
-; SI-NEXT: s_lshl_b32 s5, s72, 8
-; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_lshl_b32 s5, s6, 8
+; SI-NEXT: s_or_b32 s43, s4, s5
; SI-NEXT: s_and_b32 s5, s22, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_mov_b32 s22, s6
; SI-NEXT: s_lshl_b32 s6, s23, 24
-; SI-NEXT: v_writelane_b32 v42, s4, 58
; SI-NEXT: s_or_b32 s4, s6, s5
-; SI-NEXT: s_and_b32 s5, s60, 0xff
+; SI-NEXT: s_and_b32 s5, s24, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_lshl_b32 s6, s25, 24
-; SI-NEXT: v_writelane_b32 v42, s4, 59
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_writelane_b32 v42, s5, 60
-; SI-NEXT: s_and_b32 s5, s79, 0xff
+; SI-NEXT: v_writelane_b32 v42, s4, 2
+; SI-NEXT: s_or_b32 s4, s6, s5
+; SI-NEXT: v_readlane_b32 s5, v44, 3
+; SI-NEXT: s_and_b32 s5, s5, 0xff
+; SI-NEXT: v_readlane_b32 s6, v44, 2
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s6, s76, 24
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_writelane_b32 v42, s5, 61
-; SI-NEXT: s_and_b32 s5, s77, 0xff
-; SI-NEXT: s_lshl_b32 s6, s29, 8
-; SI-NEXT: s_or_b32 s5, s5, s6
+; SI-NEXT: s_lshl_b32 s6, s6, 24
+; SI-NEXT: s_or_b32 s25, s6, s5
+; SI-NEXT: v_readlane_b32 s5, v44, 1
+; SI-NEXT: v_readlane_b32 s6, v44, 0
+; SI-NEXT: s_and_b32 s5, s5, 0xff
+; SI-NEXT: s_lshl_b32 s6, s6, 8
+; SI-NEXT: s_or_b32 s24, s5, s6
; SI-NEXT: s_and_b32 s6, s16, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshl_b32 s16, s17, 24
-; SI-NEXT: s_or_b32 s6, s16, s6
-; SI-NEXT: v_writelane_b32 v42, s6, 62
-; SI-NEXT: s_and_b32 s6, s89, 0xff
+; SI-NEXT: v_writelane_b32 v42, s4, 3
+; SI-NEXT: s_or_b32 s4, s16, s6
+; SI-NEXT: s_and_b32 s6, s79, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s16, s88, 24
-; SI-NEXT: s_mov_b32 s4, s47
-; SI-NEXT: s_or_b32 s47, s16, s6
+; SI-NEXT: s_lshl_b32 s16, s78, 24
+; SI-NEXT: s_or_b32 s5, s16, s6
; SI-NEXT: s_and_b32 s6, s18, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshl_b32 s16, s19, 24
-; SI-NEXT: s_or_b32 s25, s16, s6
-; SI-NEXT: s_and_b32 s6, s93, 0xff
-; SI-NEXT: s_lshl_b32 s16, s92, 8
+; SI-NEXT: s_or_b32 s75, s16, s6
+; SI-NEXT: s_and_b32 s6, s91, 0xff
+; SI-NEXT: s_lshl_b32 s16, s90, 8
; SI-NEXT: s_or_b32 s6, s6, s16
-; SI-NEXT: s_and_b32 s16, s90, 0xff
+; SI-NEXT: s_and_b32 s16, s88, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: s_lshl_b32 s17, s91, 24
-; SI-NEXT: s_or_b32 s92, s17, s16
-; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff
+; SI-NEXT: s_lshl_b32 s17, s89, 24
+; SI-NEXT: s_or_b32 s78, s17, s16
+; SI-NEXT: s_and_b32 s16, s95, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24
-; SI-NEXT: s_or_b32 s76, s17, s16
-; SI-NEXT: s_and_b32 s16, s94, 0xff
+; SI-NEXT: s_lshl_b32 s17, s94, 24
+; SI-NEXT: s_mov_b32 s23, s21
+; SI-NEXT: s_or_b32 s21, s17, s16
+; SI-NEXT: s_and_b32 s16, s92, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: s_lshl_b32 s17, s95, 24
-; SI-NEXT: s_or_b32 s91, s17, s16
-; SI-NEXT: s_and_b32 s16, s35, 0xff
-; SI-NEXT: s_lshl_b32 s17, s34, 8
+; SI-NEXT: s_lshl_b32 s17, s93, 24
+; SI-NEXT: s_or_b32 s79, s17, s16
+; SI-NEXT: s_and_b32 s16, s31, 0xff
+; SI-NEXT: s_lshl_b32 s17, s30, 8
; SI-NEXT: s_or_b32 s16, s16, s17
-; SI-NEXT: s_and_b32 s17, s30, 0xff
+; SI-NEXT: s_and_b32 s17, vcc_lo, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 16
-; SI-NEXT: s_lshl_b32 s18, s31, 24
-; SI-NEXT: s_or_b32 s77, s18, s17
-; SI-NEXT: s_and_b32 s17, s39, 0xff
+; SI-NEXT: s_lshl_b32 s18, vcc_hi, 24
+; SI-NEXT: s_or_b32 s17, s18, s17
+; SI-NEXT: v_writelane_b32 v43, s17, 56
+; SI-NEXT: s_and_b32 s17, s37, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 16
-; SI-NEXT: s_lshl_b32 s18, s38, 24
-; SI-NEXT: s_or_b32 s79, s18, s17
-; SI-NEXT: s_and_b32 s17, s36, 0xff
+; SI-NEXT: s_lshl_b32 s18, s36, 24
+; SI-NEXT: s_or_b32 s17, s18, s17
+; SI-NEXT: v_writelane_b32 v43, s17, 58
+; SI-NEXT: s_and_b32 s17, s34, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 16
-; SI-NEXT: s_lshl_b32 s18, s37, 24
-; SI-NEXT: s_or_b32 s93, s18, s17
-; SI-NEXT: s_and_b32 s17, s51, 0xff
-; SI-NEXT: s_lshl_b32 s18, s50, 8
+; SI-NEXT: s_lshl_b32 s18, s35, 24
+; SI-NEXT: s_or_b32 s17, s18, s17
+; SI-NEXT: v_writelane_b32 v43, s17, 57
+; SI-NEXT: s_and_b32 s17, s49, 0xff
+; SI-NEXT: s_lshl_b32 s18, s48, 8
; SI-NEXT: s_or_b32 s17, s17, s18
-; SI-NEXT: s_and_b32 s18, s48, 0xff
+; SI-NEXT: s_and_b32 s18, s38, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
-; SI-NEXT: s_lshl_b32 s19, s49, 24
-; SI-NEXT: s_or_b32 s89, s19, s18
-; SI-NEXT: s_and_b32 s18, s55, 0xff
+; SI-NEXT: s_lshl_b32 s19, s39, 24
+; SI-NEXT: s_or_b32 s18, s19, s18
+; SI-NEXT: v_writelane_b32 v43, s18, 59
+; SI-NEXT: s_and_b32 s18, s53, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
-; SI-NEXT: s_lshl_b32 s19, s54, 24
-; SI-NEXT: s_or_b32 s31, s19, s18
-; SI-NEXT: s_and_b32 s18, s52, 0xff
+; SI-NEXT: s_lshl_b32 s19, s52, 24
+; SI-NEXT: s_or_b32 s18, s19, s18
+; SI-NEXT: v_writelane_b32 v43, s18, 61
+; SI-NEXT: s_and_b32 s18, s50, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
-; SI-NEXT: s_lshl_b32 s19, s53, 24
-; SI-NEXT: s_or_b32 s94, s19, s18
-; SI-NEXT: s_and_b32 s18, s84, 0xff
-; SI-NEXT: s_lshl_b32 s19, s67, 8
+; SI-NEXT: s_lshl_b32 s19, s51, 24
+; SI-NEXT: s_or_b32 s18, s19, s18
+; SI-NEXT: v_writelane_b32 v43, s18, 60
+; SI-NEXT: s_and_b32 s18, s65, 0xff
+; SI-NEXT: s_lshl_b32 s19, s64, 8
; SI-NEXT: s_or_b32 s18, s18, s19
-; SI-NEXT: s_and_b32 s19, s64, 0xff
+; SI-NEXT: s_and_b32 s19, s54, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s65, 24
-; SI-NEXT: s_or_b32 s60, s20, s19
-; SI-NEXT: s_and_b32 s19, s12, 0xff
+; SI-NEXT: s_lshl_b32 s20, s55, 24
+; SI-NEXT: s_or_b32 s19, s20, s19
+; SI-NEXT: v_writelane_b32 v43, s19, 62
+; SI-NEXT: s_and_b32 s19, s87, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s8, 24
-; SI-NEXT: s_or_b32 s8, s20, s19
-; SI-NEXT: s_and_b32 s19, s85, 0xff
+; SI-NEXT: s_lshl_b32 s20, s85, 24
+; SI-NEXT: s_or_b32 s19, s20, s19
+; SI-NEXT: v_writelane_b32 v43, s19, 63
+; SI-NEXT: s_and_b32 s19, s67, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s86, 24
-; SI-NEXT: s_or_b32 s12, s20, s19
-; SI-NEXT: s_and_b32 s19, s80, 0xff
+; SI-NEXT: s_lshl_b32 s20, s68, 24
+; SI-NEXT: s_or_b32 s95, s20, s19
+; SI-NEXT: s_and_b32 s19, s82, 0xff
; SI-NEXT: s_lshl_b32 s20, s9, 8
; SI-NEXT: s_or_b32 vcc_lo, s19, s20
-; SI-NEXT: s_and_b32 s19, s44, 0xff
+; SI-NEXT: s_and_b32 s19, s99, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s97, 24
-; SI-NEXT: s_or_b32 s9, s20, s19
-; SI-NEXT: s_and_b32 s19, s15, 0xff
+; SI-NEXT: s_lshl_b32 s20, s98, 24
+; SI-NEXT: s_or_b32 s30, s20, s19
+; SI-NEXT: s_and_b32 s19, s41, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: s_or_b32 s7, s20, s19
-; SI-NEXT: s_and_b32 s19, s82, 0xff
+; SI-NEXT: s_lshl_b32 s20, s80, 24
+; SI-NEXT: s_or_b32 s31, s20, s19
+; SI-NEXT: s_and_b32 s19, s7, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s83, 24
-; SI-NEXT: s_or_b32 s23, s20, s19
-; SI-NEXT: s_and_b32 s19, s26, 0xff
-; SI-NEXT: s_lshl_b32 s20, s81, 8
+; SI-NEXT: s_lshl_b32 s20, s96, 24
+; SI-NEXT: s_or_b32 s34, s20, s19
+; SI-NEXT: s_and_b32 s19, s62, 0xff
+; SI-NEXT: s_lshl_b32 s20, s15, 8
; SI-NEXT: s_or_b32 vcc_hi, s19, s20
-; SI-NEXT: s_and_b32 s19, s99, 0xff
-; SI-NEXT: v_writelane_b32 v42, s9, 50
+; SI-NEXT: s_and_b32 s19, s86, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s87, 24
-; SI-NEXT: v_writelane_b32 v42, s7, 51
-; SI-NEXT: s_or_b32 s7, s20, s19
-; SI-NEXT: s_and_b32 s19, s56, 0xff
+; SI-NEXT: s_lshl_b32 s20, s84, 24
+; SI-NEXT: s_or_b32 s35, s20, s19
+; SI-NEXT: s_and_b32 s19, s8, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s41, 24
-; SI-NEXT: v_writelane_b32 v42, s7, 52
-; SI-NEXT: s_or_b32 s7, s20, s19
-; SI-NEXT: s_and_b32 s19, s98, 0xff
+; SI-NEXT: s_lshl_b32 s20, s97, 24
+; SI-NEXT: s_or_b32 s36, s20, s19
+; SI-NEXT: s_and_b32 s19, s12, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s96, 24
-; SI-NEXT: v_writelane_b32 v42, s7, 54
-; SI-NEXT: s_or_b32 s7, s20, s19
-; SI-NEXT: s_and_b32 s19, s46, 0xff
-; SI-NEXT: s_lshl_b32 s20, s74, 8
+; SI-NEXT: s_lshl_b32 s20, s83, 24
+; SI-NEXT: s_or_b32 s37, s20, s19
+; SI-NEXT: s_and_b32 s19, s59, 0xff
+; SI-NEXT: s_lshl_b32 s20, s46, 8
; SI-NEXT: s_or_b32 s84, s19, s20
-; SI-NEXT: s_and_b32 s19, s71, 0xff
+; SI-NEXT: s_and_b32 s19, s81, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s70, 24
-; SI-NEXT: s_or_b32 s72, s20, s19
+; SI-NEXT: s_lshl_b32 s20, s71, 24
+; SI-NEXT: s_or_b32 s38, s20, s19
; SI-NEXT: s_and_b32 s19, s11, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s68, 24
-; SI-NEXT: v_writelane_b32 v42, s7, 53
-; SI-NEXT: s_or_b32 s7, s20, s19
+; SI-NEXT: s_lshl_b32 s20, s69, 24
+; SI-NEXT: s_or_b32 s39, s20, s19
; SI-NEXT: s_and_b32 s19, s14, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s69, 24
-; SI-NEXT: s_or_b32 s9, s20, s19
-; SI-NEXT: s_and_b32 s19, s58, 0xff
+; SI-NEXT: s_lshl_b32 s20, s70, 24
+; SI-NEXT: s_or_b32 s48, s20, s19
+; SI-NEXT: s_and_b32 s19, s29, 0xff
; SI-NEXT: s_lshl_b32 s20, s66, 8
; SI-NEXT: s_or_b32 s85, s19, s20
; SI-NEXT: s_and_b32 s19, s10, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s62, 24
+; SI-NEXT: s_lshl_b32 s20, s40, 24
; SI-NEXT: s_or_b32 s49, s20, s19
; SI-NEXT: s_and_b32 s19, s27, 0xff
-; SI-NEXT: v_writelane_b32 v42, s9, 55
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s28, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 33
+; SI-NEXT: v_readlane_b32 s7, v44, 35
; SI-NEXT: s_or_b32 s50, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 32
+; SI-NEXT: s_and_b32 s19, s7, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 31
+; SI-NEXT: s_lshl_b32 s20, s44, 24
; SI-NEXT: s_or_b32 s51, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 30
-; SI-NEXT: s_lshl_b32 s20, s9, 8
-; SI-NEXT: v_readlane_b32 s9, v43, 29
+; SI-NEXT: s_and_b32 s19, s23, 0xff
+; SI-NEXT: s_lshl_b32 s20, s73, 8
; SI-NEXT: s_or_b32 s86, s19, s20
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 28
+; SI-NEXT: s_and_b32 s19, s72, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 27
+; SI-NEXT: s_lshl_b32 s20, s13, 24
; SI-NEXT: s_or_b32 s52, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 26
+; SI-NEXT: s_and_b32 s19, s42, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 25
+; SI-NEXT: s_lshl_b32 s20, s60, 24
; SI-NEXT: s_or_b32 s53, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 24
+; SI-NEXT: s_and_b32 s19, s45, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 23
+; SI-NEXT: s_lshl_b32 s20, s47, 24
; SI-NEXT: s_or_b32 s54, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 22
-; SI-NEXT: s_lshl_b32 s20, s9, 8
-; SI-NEXT: v_readlane_b32 s9, v43, 21
+; SI-NEXT: s_and_b32 s19, s76, 0xff
+; SI-NEXT: s_lshl_b32 s20, s74, 8
; SI-NEXT: s_or_b32 s87, s19, s20
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 20
+; SI-NEXT: s_and_b32 s19, s61, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 19
+; SI-NEXT: s_lshl_b32 s20, s56, 24
; SI-NEXT: s_or_b32 s55, s20, s19
-; SI-NEXT: s_mov_b32 s58, s9
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 18
+; SI-NEXT: s_and_b32 s19, s57, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
+; SI-NEXT: s_lshl_b32 s20, s63, 24
; SI-NEXT: s_or_b32 s64, s20, s19
-; SI-NEXT: s_and_b32 s19, s78, 0xff
+; SI-NEXT: s_and_b32 s19, s77, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s24, 24
+; SI-NEXT: s_lshl_b32 s20, s58, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 34
; SI-NEXT: s_or_b32 s65, s20, s19
-; SI-NEXT: s_and_b32 s19, s4, 0xff
-; SI-NEXT: s_lshl_b32 s20, s45, 8
+; SI-NEXT: s_and_b32 s19, s26, 0xff
+; SI-NEXT: s_mov_b32 s42, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 8
+; SI-NEXT: v_readlane_b32 s7, v44, 33
+; SI-NEXT: s_mov_b32 s94, s26
; SI-NEXT: s_or_b32 s26, s19, s20
-; SI-NEXT: s_and_b32 s19, s13, 0xff
+; SI-NEXT: s_mov_b32 s47, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 32
+; SI-NEXT: s_mov_b32 s92, s56
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s57, 24
+; SI-NEXT: s_mov_b32 s56, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 31
; SI-NEXT: s_or_b32 s66, s20, s19
-; SI-NEXT: s_and_b32 s19, s21, 0xff
+; SI-NEXT: s_mov_b32 s61, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 30
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s73, 24
+; SI-NEXT: s_mov_b32 s60, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 29
; SI-NEXT: s_or_b32 s67, s20, s19
-; SI-NEXT: s_and_b32 s19, s42, 0xff
-; SI-NEXT: v_readlane_b32 s88, v43, 17
+; SI-NEXT: s_mov_b32 s68, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 28
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s59, 24
-; SI-NEXT: s_or_b32 s68, s20, s19
-; SI-NEXT: s_and_b32 s19, s63, 0xff
-; SI-NEXT: s_lshl_b32 s20, s88, 8
+; SI-NEXT: s_mov_b32 s59, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 27
+; SI-NEXT: s_or_b32 s45, s20, s19
+; SI-NEXT: s_mov_b32 s46, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 26
+; SI-NEXT: s_mov_b32 s69, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 8
+; SI-NEXT: v_readlane_b32 s7, v44, 25
; SI-NEXT: s_or_b32 s27, s19, s20
-; SI-NEXT: s_and_b32 s19, s40, 0xff
+; SI-NEXT: s_mov_b32 s40, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 24
+; SI-NEXT: s_mov_b32 s89, s76
+; SI-NEXT: s_mov_b32 s76, s58
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s43, 24
-; SI-NEXT: s_or_b32 s69, s20, s19
-; SI-NEXT: s_and_b32 s19, s61, 0xff
-; SI-NEXT: s_mov_b32 s39, s57
-; SI-NEXT: s_mov_b32 s57, s7
+; SI-NEXT: s_mov_b32 s58, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 23
+; SI-NEXT: s_mov_b32 s93, s74
+; SI-NEXT: s_mov_b32 s88, s57
+; SI-NEXT: s_or_b32 s57, s20, s19
+; SI-NEXT: s_mov_b32 s74, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 22
+; SI-NEXT: s_mov_b32 s90, s63
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s75, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 16
+; SI-NEXT: s_mov_b32 s63, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 21
; SI-NEXT: s_or_b32 s70, s20, s19
-; SI-NEXT: s_mov_b32 s10, s7
+; SI-NEXT: s_mov_b32 s71, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v43, 15
+; SI-NEXT: v_readlane_b32 s7, v44, 20
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_mov_b32 s71, s7
+; SI-NEXT: s_mov_b32 s81, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 14
-; SI-NEXT: s_or_b32 s62, s20, s19
-; SI-NEXT: s_mov_b32 s15, s7
-; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v43, 13
+; SI-NEXT: v_readlane_b32 s7, v44, 19
+; SI-NEXT: s_or_b32 s13, s20, s19
; SI-NEXT: s_mov_b32 s41, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 18
+; SI-NEXT: s_mov_b32 s14, s7
; SI-NEXT: s_lshl_b32 s20, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 12
+; SI-NEXT: v_readlane_b32 s7, v44, 17
; SI-NEXT: s_or_b32 s29, s19, s20
-; SI-NEXT: s_mov_b32 s14, s7
+; SI-NEXT: s_mov_b32 s10, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v43, 11
+; SI-NEXT: v_readlane_b32 s7, v44, 16
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 10
+; SI-NEXT: v_readlane_b32 s7, v44, 15
; SI-NEXT: s_or_b32 s80, s20, s19
-; SI-NEXT: s_mov_b32 s56, s7
+; SI-NEXT: s_mov_b32 s8, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v43, 9
+; SI-NEXT: v_readlane_b32 s7, v44, 14
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_mov_b32 s81, s7
+; SI-NEXT: s_mov_b32 s15, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 8
-; SI-NEXT: s_or_b32 s11, s20, s19
-; SI-NEXT: s_mov_b32 s82, s7
+; SI-NEXT: v_readlane_b32 s7, v44, 13
+; SI-NEXT: s_mov_b32 s72, s25
+; SI-NEXT: s_or_b32 s25, s20, s19
+; SI-NEXT: s_mov_b32 s83, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v43, 7
+; SI-NEXT: v_readlane_b32 s7, v44, 12
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_mov_b32 s96, s7
+; SI-NEXT: s_mov_b32 s97, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 6
-; SI-NEXT: s_mov_b32 s36, s63
-; SI-NEXT: s_mov_b32 s63, s93
-; SI-NEXT: s_mov_b32 s93, s61
-; SI-NEXT: s_mov_b32 s61, s91
-; SI-NEXT: s_mov_b32 s91, s75
-; SI-NEXT: s_mov_b32 s75, s92
-; SI-NEXT: s_or_b32 s92, s20, s19
-; SI-NEXT: s_mov_b32 s98, s7
-; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v43, 5
+; SI-NEXT: v_readlane_b32 s7, v44, 11
+; SI-NEXT: s_or_b32 s82, s20, s19
; SI-NEXT: s_mov_b32 s44, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 10
; SI-NEXT: s_lshl_b32 s20, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 4
-; SI-NEXT: s_mov_b32 s48, s13
-; SI-NEXT: s_mov_b32 s13, s94
-; SI-NEXT: s_mov_b32 s94, s21
+; SI-NEXT: v_readlane_b32 s12, v44, 9
+; SI-NEXT: v_readlane_b32 s22, v44, 8
+; SI-NEXT: s_mov_b32 s91, s77
+; SI-NEXT: s_mov_b32 s77, s21
; SI-NEXT: s_or_b32 s21, s19, s20
-; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: s_mov_b32 s95, s4
+; SI-NEXT: s_and_b32 s19, s12, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s22, 24
-; SI-NEXT: v_readlane_b32 s4, v42, 58
-; SI-NEXT: s_mov_b32 s46, s45
-; SI-NEXT: s_mov_b32 s34, s73
-; SI-NEXT: s_mov_b32 s73, s12
-; SI-NEXT: s_mov_b32 s37, s42
-; SI-NEXT: s_mov_b32 s38, s59
-; SI-NEXT: s_mov_b32 s59, s8
-; SI-NEXT: s_mov_b32 s30, s88
-; SI-NEXT: s_mov_b32 s88, s31
-; SI-NEXT: s_mov_b32 s78, s40
-; SI-NEXT: s_mov_b32 s31, s43
-; SI-NEXT: s_mov_b32 s12, s7
-; SI-NEXT: s_mov_b32 s7, s22
-; SI-NEXT: s_or_b32 s83, s20, s19
-; SI-NEXT: s_lshl_b32 s20, s4, 16
-; SI-NEXT: s_lshl_b32 s74, s5, 16
+; SI-NEXT: s_mov_b32 s73, s4
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s7, s12
+; SI-NEXT: s_mov_b32 s62, s22
+; SI-NEXT: s_or_b32 s28, s20, s19
+; SI-NEXT: s_lshl_b32 s20, s43, 16
+; SI-NEXT: s_lshl_b32 s23, s24, 16
; SI-NEXT: s_lshl_b32 s22, s6, 16
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s19, s17, 16
@@ -157795,50 +157840,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_lshl_b32 s17, vcc_lo, 16
; SI-NEXT: s_lshl_b32 s6, vcc_hi, 16
; SI-NEXT: s_lshl_b32 s99, s84, 16
-; SI-NEXT: s_lshl_b32 s8, s85, 16
-; SI-NEXT: s_lshl_b32 s97, s86, 16
-; SI-NEXT: s_lshl_b32 s28, s87, 16
+; SI-NEXT: s_lshl_b32 s98, s85, 16
+; SI-NEXT: s_lshl_b32 s12, s86, 16
+; SI-NEXT: s_lshl_b32 s96, s87, 16
; SI-NEXT: s_lshl_b32 s87, s26, 16
-; SI-NEXT: v_readlane_b32 s26, v42, 56
+; SI-NEXT: v_readlane_b32 s26, v42, 0
; SI-NEXT: s_lshl_b32 s86, s27, 16
-; SI-NEXT: v_readlane_b32 s27, v42, 57
-; SI-NEXT: v_readlane_b32 s35, v42, 61
+; SI-NEXT: v_readlane_b32 s27, v42, 1
; SI-NEXT: s_lshl_b32 s85, s29, 16
-; SI-NEXT: v_readlane_b32 s29, v42, 60
-; SI-NEXT: v_readlane_b32 s24, v42, 59
-; SI-NEXT: v_readlane_b32 s90, v42, 62
+; SI-NEXT: v_readlane_b32 s29, v42, 3
+; SI-NEXT: v_readlane_b32 s24, v42, 2
; SI-NEXT: s_lshl_b32 s84, s21, 16
-; SI-NEXT: s_mov_b32 s21, s47
+; SI-NEXT: s_mov_b32 s21, s5
; SI-NEXT: s_cbranch_execnz .LBB89_3
; SI-NEXT: .LBB89_2: ; %cmp.true
-; SI-NEXT: s_add_i32 s4, s98, 3
+; SI-NEXT: s_add_i32 s4, s44, 3
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_lshl_b32 s5, s44, 8
-; SI-NEXT: s_add_i32 s6, s12, 3
+; SI-NEXT: s_lshl_b32 s5, s11, 8
+; SI-NEXT: s_add_i32 s6, s7, 3
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s6, s6, 0xff
-; SI-NEXT: s_lshl_b32 s5, s7, 24
+; SI-NEXT: s_lshl_b32 s5, s62, 24
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: s_add_i32 s5, s56, 3
+; SI-NEXT: s_add_i32 s5, s8, 3
; SI-NEXT: s_and_b32 s5, s5, 0xff
-; SI-NEXT: s_lshl_b32 s6, s81, 8
-; SI-NEXT: s_add_i32 s16, s82, 3
+; SI-NEXT: s_lshl_b32 s6, s15, 8
+; SI-NEXT: s_add_i32 s16, s83, 3
; SI-NEXT: s_or_b32 s5, s6, s5
; SI-NEXT: s_and_b32 s16, s16, 0xff
-; SI-NEXT: s_lshl_b32 s6, s96, 24
+; SI-NEXT: s_lshl_b32 s6, s97, 24
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_addk_i32 s5, 0x300
; SI-NEXT: s_or_b32 s6, s6, s16
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: s_add_i32 s6, s15, 3
+; SI-NEXT: s_add_i32 s6, s41, 3
; SI-NEXT: s_and_b32 s6, s6, 0xff
-; SI-NEXT: s_lshl_b32 s16, s41, 8
-; SI-NEXT: s_add_i32 s17, s14, 3
+; SI-NEXT: s_lshl_b32 s16, s14, 8
+; SI-NEXT: s_add_i32 s17, s10, 3
; SI-NEXT: s_or_b32 s6, s16, s6
; SI-NEXT: s_and_b32 s17, s17, 0xff
; SI-NEXT: s_lshl_b32 s16, s9, 24
@@ -157847,162 +157890,156 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_and_b32 s6, s6, 0xffff
; SI-NEXT: s_or_b32 s6, s16, s6
-; SI-NEXT: s_add_i32 s16, s93, 3
+; SI-NEXT: s_add_i32 s16, s74, 3
; SI-NEXT: s_and_b32 s16, s16, 0xff
-; SI-NEXT: s_lshl_b32 s17, s91, 8
-; SI-NEXT: s_add_i32 s18, s10, 3
+; SI-NEXT: s_lshl_b32 s17, s63, 8
+; SI-NEXT: s_add_i32 s18, s71, 3
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_and_b32 s18, s18, 0xff
-; SI-NEXT: s_lshl_b32 s17, s71, 24
+; SI-NEXT: s_lshl_b32 s17, s81, 24
; SI-NEXT: s_lshl_b32 s18, s18, 16
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: s_add_i32 s17, s36, 3
+; SI-NEXT: s_add_i32 s17, s46, 3
; SI-NEXT: s_and_b32 s17, s17, 0xff
-; SI-NEXT: s_lshl_b32 s18, s30, 8
-; SI-NEXT: s_add_i32 s19, s78, 3
+; SI-NEXT: s_lshl_b32 s18, s69, 8
+; SI-NEXT: s_add_i32 s19, s40, 3
; SI-NEXT: s_or_b32 s17, s18, s17
; SI-NEXT: s_and_b32 s19, s19, 0xff
-; SI-NEXT: s_lshl_b32 s18, s31, 24
+; SI-NEXT: s_lshl_b32 s18, s58, 24
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_addk_i32 s17, 0x300
; SI-NEXT: s_or_b32 s18, s18, s19
; SI-NEXT: s_and_b32 s17, s17, 0xffff
; SI-NEXT: s_or_b32 s17, s18, s17
-; SI-NEXT: s_add_i32 s18, s94, 3
+; SI-NEXT: s_add_i32 s18, s61, 3
; SI-NEXT: s_and_b32 s18, s18, 0xff
-; SI-NEXT: s_lshl_b32 s19, s34, 8
-; SI-NEXT: s_add_i32 s20, s37, 3
+; SI-NEXT: s_lshl_b32 s19, s60, 8
+; SI-NEXT: s_add_i32 s20, s68, 3
; SI-NEXT: s_or_b32 s18, s19, s18
; SI-NEXT: s_and_b32 s20, s20, 0xff
-; SI-NEXT: s_lshl_b32 s19, s38, 24
+; SI-NEXT: s_lshl_b32 s19, s59, 24
; SI-NEXT: s_lshl_b32 s20, s20, 16
; SI-NEXT: s_addk_i32 s18, 0x300
; SI-NEXT: s_or_b32 s19, s19, s20
; SI-NEXT: s_and_b32 s18, s18, 0xffff
; SI-NEXT: s_or_b32 s18, s19, s18
-; SI-NEXT: s_add_i32 s19, s95, 3
+; SI-NEXT: s_add_i32 s19, s94, 3
; SI-NEXT: s_and_b32 s19, s19, 0xff
-; SI-NEXT: s_lshl_b32 s20, s46, 8
-; SI-NEXT: s_add_i32 s22, s48, 3
+; SI-NEXT: s_lshl_b32 s20, s42, 8
+; SI-NEXT: s_add_i32 s22, s47, 3
; SI-NEXT: s_or_b32 s19, s20, s19
; SI-NEXT: s_and_b32 s22, s22, 0xff
-; SI-NEXT: s_lshl_b32 s20, s39, 24
+; SI-NEXT: s_lshl_b32 s20, s56, 24
; SI-NEXT: s_lshl_b32 s22, s22, 16
; SI-NEXT: s_addk_i32 s19, 0x300
; SI-NEXT: s_or_b32 s20, s20, s22
; SI-NEXT: s_and_b32 s19, s19, 0xffff
; SI-NEXT: s_or_b32 s19, s20, s19
-; SI-NEXT: s_add_i32 s20, s58, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 18
+; SI-NEXT: s_add_i32 s20, s88, 3
; SI-NEXT: s_and_b32 s20, s20, 0xff
-; SI-NEXT: s_lshl_b32 s22, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v42, 49
+; SI-NEXT: s_lshl_b32 s22, s90, 8
+; SI-NEXT: s_add_i32 s23, s91, 3
; SI-NEXT: s_or_b32 s20, s22, s20
-; SI-NEXT: s_lshl_b32 s22, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v42, 48
-; SI-NEXT: s_add_i32 s23, s7, 3
; SI-NEXT: s_and_b32 s23, s23, 0xff
+; SI-NEXT: s_lshl_b32 s22, s76, 24
; SI-NEXT: s_lshl_b32 s23, s23, 16
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_or_b32 s22, s22, s23
; SI-NEXT: s_and_b32 s20, s20, 0xffff
-; SI-NEXT: v_readlane_b32 s7, v43, 23
; SI-NEXT: s_or_b32 s20, s22, s20
-; SI-NEXT: s_add_i32 s22, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 22
+; SI-NEXT: s_add_i32 s22, s89, 3
+; SI-NEXT: v_readlane_b32 s7, v43, 55
; SI-NEXT: s_and_b32 s22, s22, 0xff
-; SI-NEXT: s_lshl_b32 s23, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 20
-; SI-NEXT: s_or_b32 s22, s23, s22
-; SI-NEXT: s_lshl_b32 s23, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 21
+; SI-NEXT: s_lshl_b32 s23, s93, 8
; SI-NEXT: s_add_i32 s60, s7, 3
+; SI-NEXT: s_or_b32 s22, s23, s22
; SI-NEXT: s_and_b32 s60, s60, 0xff
+; SI-NEXT: s_lshl_b32 s23, s92, 24
; SI-NEXT: s_lshl_b32 s60, s60, 16
; SI-NEXT: s_addk_i32 s22, 0x300
; SI-NEXT: s_or_b32 s23, s23, s60
; SI-NEXT: s_and_b32 s22, s22, 0xffff
-; SI-NEXT: v_readlane_b32 s7, v43, 27
+; SI-NEXT: v_readlane_b32 s7, v44, 47
; SI-NEXT: s_or_b32 s22, s23, s22
; SI-NEXT: s_add_i32 s23, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 26
+; SI-NEXT: v_readlane_b32 s7, v44, 45
; SI-NEXT: s_and_b32 s23, s23, 0xff
; SI-NEXT: s_lshl_b32 s60, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 24
+; SI-NEXT: v_readlane_b32 s7, v43, 54
; SI-NEXT: s_or_b32 s23, s60, s23
; SI-NEXT: s_lshl_b32 s60, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 25
+; SI-NEXT: v_readlane_b32 s7, v43, 53
; SI-NEXT: s_add_i32 s61, s7, 3
; SI-NEXT: s_and_b32 s61, s61, 0xff
; SI-NEXT: s_lshl_b32 s61, s61, 16
; SI-NEXT: s_addk_i32 s23, 0x300
; SI-NEXT: s_or_b32 s60, s60, s61
; SI-NEXT: s_and_b32 s23, s23, 0xffff
-; SI-NEXT: v_readlane_b32 s7, v43, 31
+; SI-NEXT: v_readlane_b32 s7, v44, 48
; SI-NEXT: s_or_b32 s23, s60, s23
; SI-NEXT: s_add_i32 s60, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 30
+; SI-NEXT: v_readlane_b32 s7, v44, 46
; SI-NEXT: s_and_b32 s60, s60, 0xff
; SI-NEXT: s_lshl_b32 s61, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 28
+; SI-NEXT: v_readlane_b32 s7, v44, 44
; SI-NEXT: s_or_b32 s60, s61, s60
; SI-NEXT: s_lshl_b32 s61, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 29
+; SI-NEXT: v_readlane_b32 s7, v44, 43
; SI-NEXT: s_add_i32 s62, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 47
+; SI-NEXT: v_readlane_b32 s7, v43, 52
; SI-NEXT: s_and_b32 s62, s62, 0xff
; SI-NEXT: s_add_i32 s59, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 46
+; SI-NEXT: v_readlane_b32 s7, v43, 51
; SI-NEXT: s_lshl_b32 s62, s62, 16
; SI-NEXT: s_addk_i32 s60, 0x300
; SI-NEXT: s_and_b32 s59, s59, 0xff
; SI-NEXT: s_lshl_b32 s58, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 32
+; SI-NEXT: v_readlane_b32 s7, v44, 42
; SI-NEXT: s_or_b32 s61, s61, s62
; SI-NEXT: s_and_b32 s60, s60, 0xffff
; SI-NEXT: s_or_b32 s58, s58, s59
; SI-NEXT: s_lshl_b32 s59, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 33
+; SI-NEXT: v_readlane_b32 s7, v44, 35
; SI-NEXT: s_or_b32 s60, s61, s60
; SI-NEXT: s_add_i32 s61, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 45
+; SI-NEXT: v_readlane_b32 s7, v43, 50
; SI-NEXT: s_add_i32 s57, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 44
+; SI-NEXT: v_readlane_b32 s7, v43, 49
; SI-NEXT: s_lshl_b32 s56, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v42, 43
+; SI-NEXT: v_readlane_b32 s7, v43, 48
; SI-NEXT: s_lshl_b32 s47, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v42, 42
+; SI-NEXT: v_readlane_b32 s7, v43, 47
; SI-NEXT: s_add_i32 s46, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 41
+; SI-NEXT: v_readlane_b32 s7, v43, 46
; SI-NEXT: s_add_i32 s45, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 38
+; SI-NEXT: v_readlane_b32 s7, v43, 43
; SI-NEXT: s_lshl_b32 s42, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v42, 35
+; SI-NEXT: v_readlane_b32 s7, v43, 40
; SI-NEXT: s_lshl_b32 s15, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v42, 34
+; SI-NEXT: v_readlane_b32 s7, v43, 39
; SI-NEXT: s_and_b32 s45, s45, 0xff
; SI-NEXT: s_add_i32 s14, s7, 3
; SI-NEXT: s_or_b32 s42, s42, s45
; SI-NEXT: s_and_b32 s14, s14, 0xff
; SI-NEXT: s_lshl_b32 s14, s14, 16
; SI-NEXT: s_addk_i32 s42, 0x300
-; SI-NEXT: v_readlane_b32 s7, v42, 40
+; SI-NEXT: v_readlane_b32 s7, v43, 45
; SI-NEXT: s_and_b32 s57, s57, 0xff
; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: s_and_b32 s15, s42, 0xffff
; SI-NEXT: s_add_i32 s44, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 39
+; SI-NEXT: v_readlane_b32 s7, v43, 44
; SI-NEXT: s_or_b32 s56, s56, s57
; SI-NEXT: s_or_b32 s57, s14, s15
; SI-NEXT: s_and_b32 s14, s44, 0xff
; SI-NEXT: s_lshl_b32 s15, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v42, 37
+; SI-NEXT: v_readlane_b32 s7, v43, 42
; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: s_lshl_b32 s15, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v42, 36
+; SI-NEXT: v_readlane_b32 s7, v43, 41
; SI-NEXT: s_add_i32 s40, s7, 3
; SI-NEXT: s_and_b32 s61, s61, 0xff
; SI-NEXT: s_and_b32 s40, s40, 0xff
@@ -158017,15 +158054,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s58, s59, s58
; SI-NEXT: s_or_b32 s59, s15, s14
; SI-NEXT: s_add_i32 s14, s6, 0x3000000
-; SI-NEXT: v_readlane_b32 s6, v42, 31
+; SI-NEXT: v_readlane_b32 s6, v43, 36
; SI-NEXT: s_add_i32 s11, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 28
+; SI-NEXT: v_readlane_b32 s7, v43, 33
; SI-NEXT: s_and_b32 s6, s11, 0xff
; SI-NEXT: s_lshl_b32 s8, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v42, 25
+; SI-NEXT: v_readlane_b32 s7, v43, 30
; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: s_lshl_b32 s8, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v42, 24
+; SI-NEXT: v_readlane_b32 s7, v43, 29
; SI-NEXT: s_add_i32 s24, s7, 3
; SI-NEXT: s_and_b32 s11, s24, 0xff
; SI-NEXT: s_addk_i32 s6, 0x300
@@ -158033,47 +158070,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s6, s6, 0xffff
; SI-NEXT: s_or_b32 s8, s8, s11
; SI-NEXT: s_or_b32 s8, s8, s6
-; SI-NEXT: v_readlane_b32 s6, v42, 32
+; SI-NEXT: v_readlane_b32 s6, v43, 37
; SI-NEXT: s_add_i32 s12, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 26
+; SI-NEXT: v_readlane_b32 s7, v43, 31
; SI-NEXT: s_and_b32 s6, s12, 0xff
; SI-NEXT: s_lshl_b32 s11, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v42, 16
+; SI-NEXT: v_readlane_b32 s7, v43, 21
; SI-NEXT: s_or_b32 s6, s11, s6
; SI-NEXT: s_lshl_b32 s11, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v42, 18
+; SI-NEXT: v_readlane_b32 s7, v43, 23
; SI-NEXT: s_add_i32 s12, s7, 3
; SI-NEXT: s_and_b32 s12, s12, 0xff
; SI-NEXT: s_addk_i32 s6, 0x300
; SI-NEXT: s_lshl_b32 s12, s12, 16
-; SI-NEXT: v_readlane_b32 s7, v42, 33
+; SI-NEXT: v_readlane_b32 s7, v43, 38
; SI-NEXT: s_and_b32 s6, s6, 0xffff
; SI-NEXT: s_or_b32 s11, s11, s12
; SI-NEXT: s_add_i32 s13, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 30
+; SI-NEXT: v_readlane_b32 s7, v43, 35
; SI-NEXT: s_or_b32 s6, s11, s6
; SI-NEXT: s_and_b32 s11, s13, 0xff
; SI-NEXT: s_lshl_b32 s10, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v42, 22
+; SI-NEXT: v_readlane_b32 s7, v43, 27
; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: s_lshl_b32 s11, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v42, 23
+; SI-NEXT: v_readlane_b32 s7, v43, 28
; SI-NEXT: s_add_i32 s25, s7, 3
; SI-NEXT: s_and_b32 s12, s25, 0xff
; SI-NEXT: s_addk_i32 s10, 0x300
; SI-NEXT: s_lshl_b32 s12, s12, 16
; SI-NEXT: s_and_b32 s10, s10, 0xffff
; SI-NEXT: s_or_b32 s11, s11, s12
-; SI-NEXT: v_readlane_b32 s7, v42, 29
+; SI-NEXT: v_readlane_b32 s7, v43, 34
; SI-NEXT: s_or_b32 s10, s11, s10
; SI-NEXT: s_add_i32 s9, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v42, 27
-; SI-NEXT: v_readlane_b32 s11, v42, 20
+; SI-NEXT: v_readlane_b32 s7, v43, 32
+; SI-NEXT: v_readlane_b32 s11, v43, 25
; SI-NEXT: s_and_b32 s9, s9, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
; SI-NEXT: s_add_i32 s11, s11, 3
; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: v_readlane_b32 s9, v42, 21
+; SI-NEXT: v_readlane_b32 s9, v43, 26
; SI-NEXT: s_and_b32 s11, s11, 0xff
; SI-NEXT: s_addk_i32 s7, 0x300
; SI-NEXT: s_lshl_b32 s9, s9, 24
@@ -158081,15 +158118,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s7, s7, 0xffff
; SI-NEXT: s_or_b32 s9, s9, s11
; SI-NEXT: s_or_b32 s7, s9, s7
-; SI-NEXT: v_readlane_b32 s9, v42, 19
+; SI-NEXT: v_readlane_b32 s9, v43, 24
; SI-NEXT: s_add_i32 s21, s9, 3
-; SI-NEXT: v_readlane_b32 s11, v42, 17
-; SI-NEXT: v_readlane_b32 s12, v42, 14
+; SI-NEXT: v_readlane_b32 s11, v43, 22
+; SI-NEXT: v_readlane_b32 s12, v43, 19
; SI-NEXT: s_and_b32 s9, s21, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 8
; SI-NEXT: s_add_i32 s12, s12, 3
; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: v_readlane_b32 s11, v42, 15
+; SI-NEXT: v_readlane_b32 s11, v43, 20
; SI-NEXT: s_and_b32 s12, s12, 0xff
; SI-NEXT: s_addk_i32 s9, 0x300
; SI-NEXT: s_lshl_b32 s11, s11, 24
@@ -158097,15 +158134,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s9, s9, 0xffff
; SI-NEXT: s_or_b32 s11, s11, s12
; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: v_readlane_b32 s11, v42, 13
+; SI-NEXT: v_readlane_b32 s11, v43, 18
; SI-NEXT: s_add_i32 s11, s11, 3
-; SI-NEXT: v_readlane_b32 s12, v42, 12
-; SI-NEXT: v_readlane_b32 s13, v42, 10
+; SI-NEXT: v_readlane_b32 s12, v43, 17
+; SI-NEXT: v_readlane_b32 s13, v43, 15
; SI-NEXT: s_and_b32 s11, s11, 0xff
; SI-NEXT: s_lshl_b32 s12, s12, 8
; SI-NEXT: s_add_i32 s13, s13, 3
; SI-NEXT: s_or_b32 s11, s12, s11
-; SI-NEXT: v_readlane_b32 s12, v42, 11
+; SI-NEXT: v_readlane_b32 s12, v43, 16
; SI-NEXT: s_and_b32 s13, s13, 0xff
; SI-NEXT: s_addk_i32 s11, 0x300
; SI-NEXT: s_lshl_b32 s12, s12, 24
@@ -158113,16 +158150,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s11, s11, 0xffff
; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: s_or_b32 s11, s12, s11
-; SI-NEXT: v_readlane_b32 s12, v42, 9
+; SI-NEXT: v_readlane_b32 s12, v43, 14
; SI-NEXT: s_add_i32 s15, s16, 0x3000000
; SI-NEXT: s_add_i32 s12, s12, 3
-; SI-NEXT: v_readlane_b32 s13, v42, 8
-; SI-NEXT: v_readlane_b32 s16, v42, 6
+; SI-NEXT: v_readlane_b32 s13, v43, 13
+; SI-NEXT: v_readlane_b32 s16, v43, 11
; SI-NEXT: s_and_b32 s12, s12, 0xff
; SI-NEXT: s_lshl_b32 s13, s13, 8
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: s_or_b32 s12, s13, s12
-; SI-NEXT: v_readlane_b32 s13, v42, 7
+; SI-NEXT: v_readlane_b32 s13, v43, 12
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_addk_i32 s12, 0x300
; SI-NEXT: s_lshl_b32 s13, s13, 24
@@ -158130,16 +158167,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s12, s12, 0xffff
; SI-NEXT: s_or_b32 s13, s13, s16
; SI-NEXT: s_or_b32 s12, s13, s12
-; SI-NEXT: v_readlane_b32 s13, v42, 5
+; SI-NEXT: v_readlane_b32 s13, v43, 10
; SI-NEXT: s_add_i32 s40, s17, 0x3000000
; SI-NEXT: s_add_i32 s13, s13, 3
-; SI-NEXT: v_readlane_b32 s16, v42, 4
-; SI-NEXT: v_readlane_b32 s17, v42, 2
+; SI-NEXT: v_readlane_b32 s16, v43, 9
+; SI-NEXT: v_readlane_b32 s17, v43, 7
; SI-NEXT: s_and_b32 s13, s13, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 8
; SI-NEXT: s_add_i32 s17, s17, 3
; SI-NEXT: s_or_b32 s13, s16, s13
-; SI-NEXT: v_readlane_b32 s16, v42, 3
+; SI-NEXT: v_readlane_b32 s16, v43, 8
; SI-NEXT: s_and_b32 s17, s17, 0xff
; SI-NEXT: s_addk_i32 s13, 0x300
; SI-NEXT: s_lshl_b32 s16, s16, 24
@@ -158147,16 +158184,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s13, s13, 0xffff
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_or_b32 s13, s16, s13
-; SI-NEXT: v_readlane_b32 s16, v42, 1
+; SI-NEXT: v_readlane_b32 s16, v43, 6
; SI-NEXT: s_add_i32 s41, s18, 0x3000000
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 0
-; SI-NEXT: v_readlane_b32 s18, v43, 62
+; SI-NEXT: v_readlane_b32 s17, v43, 5
+; SI-NEXT: v_readlane_b32 s18, v43, 3
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v43, 63
+; SI-NEXT: v_readlane_b32 s17, v43, 4
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -158165,16 +158202,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s17, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v43, 61
+; SI-NEXT: v_readlane_b32 s16, v43, 2
; SI-NEXT: s_add_i32 s42, s19, 0x3000000
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s18, v43, 60
-; SI-NEXT: v_readlane_b32 s19, v43, 58
+; SI-NEXT: v_readlane_b32 s18, v43, 1
+; SI-NEXT: v_readlane_b32 s19, v44, 63
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 8
; SI-NEXT: s_add_i32 s19, s19, 3
; SI-NEXT: s_or_b32 s16, s18, s16
-; SI-NEXT: v_readlane_b32 s18, v43, 59
+; SI-NEXT: v_readlane_b32 s18, v43, 0
; SI-NEXT: s_and_b32 s19, s19, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s18, s18, 24
@@ -158182,16 +158219,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: s_or_b32 s18, s18, s19
; SI-NEXT: s_or_b32 s16, s18, s16
-; SI-NEXT: v_readlane_b32 s18, v43, 57
+; SI-NEXT: v_readlane_b32 s18, v44, 62
; SI-NEXT: s_add_i32 s43, s20, 0x3000000
; SI-NEXT: s_add_i32 s18, s18, 3
-; SI-NEXT: v_readlane_b32 s19, v43, 56
-; SI-NEXT: v_readlane_b32 s20, v43, 54
+; SI-NEXT: v_readlane_b32 s19, v44, 61
+; SI-NEXT: v_readlane_b32 s20, v44, 59
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 8
; SI-NEXT: s_add_i32 s20, s20, 3
; SI-NEXT: s_or_b32 s18, s19, s18
-; SI-NEXT: v_readlane_b32 s19, v43, 55
+; SI-NEXT: v_readlane_b32 s19, v44, 60
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_addk_i32 s18, 0x300
; SI-NEXT: s_lshl_b32 s19, s19, 24
@@ -158199,15 +158236,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s18, s18, 0xffff
; SI-NEXT: s_or_b32 s19, s19, s20
; SI-NEXT: s_or_b32 s18, s19, s18
-; SI-NEXT: v_readlane_b32 s19, v43, 53
+; SI-NEXT: v_readlane_b32 s19, v44, 58
; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: v_readlane_b32 s20, v43, 52
-; SI-NEXT: v_readlane_b32 s21, v43, 50
+; SI-NEXT: v_readlane_b32 s20, v44, 57
+; SI-NEXT: v_readlane_b32 s21, v44, 55
; SI-NEXT: s_and_b32 s19, s19, 0xff
; SI-NEXT: s_lshl_b32 s20, s20, 8
; SI-NEXT: s_add_i32 s21, s21, 3
; SI-NEXT: s_or_b32 s19, s20, s19
-; SI-NEXT: v_readlane_b32 s20, v43, 51
+; SI-NEXT: v_readlane_b32 s20, v44, 56
; SI-NEXT: s_and_b32 s21, s21, 0xff
; SI-NEXT: s_addk_i32 s19, 0x300
; SI-NEXT: s_lshl_b32 s20, s20, 24
@@ -158215,16 +158252,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s19, s19, 0xffff
; SI-NEXT: s_or_b32 s20, s20, s21
; SI-NEXT: s_or_b32 s19, s20, s19
-; SI-NEXT: v_readlane_b32 s20, v43, 49
+; SI-NEXT: v_readlane_b32 s20, v44, 54
; SI-NEXT: s_add_i32 s44, s22, 0x3000000
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: v_readlane_b32 s21, v43, 48
-; SI-NEXT: v_readlane_b32 s22, v43, 46
+; SI-NEXT: v_readlane_b32 s21, v44, 53
+; SI-NEXT: v_readlane_b32 s22, v44, 51
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s21, s21, 8
; SI-NEXT: s_add_i32 s22, s22, 3
; SI-NEXT: s_or_b32 s20, s21, s20
-; SI-NEXT: v_readlane_b32 s21, v43, 47
+; SI-NEXT: v_readlane_b32 s21, v44, 52
; SI-NEXT: s_and_b32 s22, s22, 0xff
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_lshl_b32 s21, s21, 24
@@ -158233,16 +158270,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s21, s21, s22
; SI-NEXT: s_or_b32 s20, s21, s20
; SI-NEXT: s_add_i32 s21, s20, 0x3000000
-; SI-NEXT: v_readlane_b32 s20, v43, 43
+; SI-NEXT: v_readlane_b32 s20, v44, 1
; SI-NEXT: s_add_i32 s45, s23, 0x3000000
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: v_readlane_b32 s22, v43, 42
-; SI-NEXT: v_readlane_b32 s23, v43, 44
+; SI-NEXT: v_readlane_b32 s22, v44, 0
+; SI-NEXT: v_readlane_b32 s23, v44, 49
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s22, s22, 8
; SI-NEXT: s_add_i32 s23, s23, 3
; SI-NEXT: s_or_b32 s20, s22, s20
-; SI-NEXT: v_readlane_b32 s22, v43, 45
+; SI-NEXT: v_readlane_b32 s22, v44, 50
; SI-NEXT: s_and_b32 s23, s23, 0xff
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_lshl_b32 s22, s22, 24
@@ -158251,15 +158288,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s22, s22, s23
; SI-NEXT: s_or_b32 s20, s22, s20
; SI-NEXT: s_add_i32 s22, s20, 0x3000000
-; SI-NEXT: v_readlane_b32 s20, v43, 41
+; SI-NEXT: v_readlane_b32 s20, v44, 41
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: v_readlane_b32 s23, v43, 40
-; SI-NEXT: v_readlane_b32 s24, v43, 38
+; SI-NEXT: v_readlane_b32 s23, v44, 40
+; SI-NEXT: v_readlane_b32 s24, v44, 3
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s23, s23, 8
; SI-NEXT: s_add_i32 s24, s24, 3
; SI-NEXT: s_or_b32 s20, s23, s20
-; SI-NEXT: v_readlane_b32 s23, v43, 39
+; SI-NEXT: v_readlane_b32 s23, v44, 2
; SI-NEXT: s_and_b32 s24, s24, 0xff
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_lshl_b32 s23, s23, 24
@@ -158268,134 +158305,136 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s23, s23, s24
; SI-NEXT: s_or_b32 s20, s23, s20
; SI-NEXT: s_add_i32 s23, s20, 0x3000000
-; SI-NEXT: v_readlane_b32 s20, v43, 37
+; SI-NEXT: v_readlane_b32 s20, v44, 39
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: v_readlane_b32 s24, v43, 36
-; SI-NEXT: v_readlane_b32 s25, v43, 34
+; SI-NEXT: v_readlane_b32 s24, v44, 38
+; SI-NEXT: v_readlane_b32 s25, v44, 36
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s24, s24, 8
; SI-NEXT: s_add_i32 s25, s25, 3
; SI-NEXT: s_or_b32 s20, s24, s20
-; SI-NEXT: v_readlane_b32 s24, v43, 35
+; SI-NEXT: v_readlane_b32 s24, v44, 37
; SI-NEXT: s_and_b32 s25, s25, 0xff
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_lshl_b32 s24, s24, 24
; SI-NEXT: s_lshl_b32 s25, s25, 16
; SI-NEXT: s_and_b32 s20, s20, 0xffff
; SI-NEXT: s_or_b32 s24, s24, s25
-; SI-NEXT: s_and_b32 s46, s46, 0xff
+; SI-NEXT: s_add_i32 s16, s16, 0x3000000
+; SI-NEXT: s_add_i32 s18, s18, 0x3000000
; SI-NEXT: s_or_b32 s20, s24, s20
-; SI-NEXT: v_readlane_b32 s24, v43, 3
-; SI-NEXT: s_lshl_b32 s46, s46, 16
-; SI-NEXT: s_addk_i32 s56, 0x300
+; SI-NEXT: v_readlane_b32 s24, v44, 7
; SI-NEXT: s_add_i32 s24, s24, 3
-; SI-NEXT: v_readlane_b32 s25, v43, 2
-; SI-NEXT: v_readlane_b32 s26, v43, 1
-; SI-NEXT: s_or_b32 s46, s47, s46
-; SI-NEXT: s_and_b32 s47, s56, 0xffff
-; SI-NEXT: s_add_i32 s7, s7, 0x3000000
-; SI-NEXT: s_add_i32 s9, s9, 0x3000000
+; SI-NEXT: v_readlane_b32 s25, v44, 6
+; SI-NEXT: v_readlane_b32 s26, v44, 5
+; SI-NEXT: s_and_b32 s79, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s77, s18, 16
+; SI-NEXT: s_and_b32 s18, s16, 0xffff0000
; SI-NEXT: s_and_b32 s24, s24, 0xff
; SI-NEXT: s_lshl_b32 s25, s25, 8
; SI-NEXT: s_add_i32 s26, s26, 3
-; SI-NEXT: s_or_b32 s56, s46, s47
-; SI-NEXT: s_add_i32 s47, s58, 0x3000000
-; SI-NEXT: s_add_i32 s58, s59, 0x3000000
-; SI-NEXT: s_add_i32 s10, s10, 0x3000000
+; SI-NEXT: v_writelane_b32 v43, s18, 56
+; SI-NEXT: s_and_b32 s18, s17, 0xffff0000
+; SI-NEXT: s_and_b32 s46, s46, 0xff
+; SI-NEXT: s_add_i32 s13, s13, 0x3000000
; SI-NEXT: s_or_b32 s24, s25, s24
-; SI-NEXT: v_readlane_b32 s25, v43, 0
+; SI-NEXT: v_readlane_b32 s25, v44, 4
; SI-NEXT: s_and_b32 s26, s26, 0xff
-; SI-NEXT: s_and_b32 s73, s9, 0xffff0000
-; SI-NEXT: s_lshl_b32 s59, s9, 16
-; SI-NEXT: s_and_b32 s9, s7, 0xffff0000
-; SI-NEXT: s_add_i32 s6, s6, 0x3000000
+; SI-NEXT: v_writelane_b32 v43, s18, 57
+; SI-NEXT: s_lshl_b32 s17, s17, 16
+; SI-NEXT: s_lshl_b32 s46, s46, 16
+; SI-NEXT: s_addk_i32 s56, 0x300
+; SI-NEXT: s_add_i32 s12, s12, 0x3000000
+; SI-NEXT: s_add_i32 s19, s19, 0x3000000
; SI-NEXT: s_addk_i32 s24, 0x300
; SI-NEXT: s_lshl_b32 s25, s25, 24
; SI-NEXT: s_lshl_b32 s26, s26, 16
-; SI-NEXT: s_and_b32 s63, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s79, s17, 16
-; SI-NEXT: v_writelane_b32 v42, s9, 50
-; SI-NEXT: s_lshl_b32 s17, s7, 16
-; SI-NEXT: s_lshl_b32 s7, s10, 16
-; SI-NEXT: s_add_i32 s8, s8, 0x3000000
+; SI-NEXT: v_writelane_b32 v43, s17, 58
+; SI-NEXT: s_and_b32 s17, s13, 0xffff0000
+; SI-NEXT: s_or_b32 s46, s47, s46
+; SI-NEXT: s_and_b32 s47, s56, 0xffff
; SI-NEXT: s_and_b32 s24, s24, 0xffff
; SI-NEXT: s_or_b32 s25, s25, s26
-; SI-NEXT: v_writelane_b32 v42, s7, 51
-; SI-NEXT: s_and_b32 s7, s6, 0xffff0000
+; SI-NEXT: s_and_b32 s72, s23, 0xffff0000
+; SI-NEXT: s_lshl_b32 s29, s23, 16
+; SI-NEXT: s_and_b32 s73, s22, 0xffff0000
+; SI-NEXT: s_lshl_b32 s23, s22, 16
+; SI-NEXT: s_and_b32 s78, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s22, s19, 16
+; SI-NEXT: v_writelane_b32 v43, s17, 59
+; SI-NEXT: s_lshl_b32 s19, s13, 16
+; SI-NEXT: s_and_b32 s13, s12, 0xffff0000
+; SI-NEXT: s_or_b32 s56, s46, s47
+; SI-NEXT: s_add_i32 s11, s11, 0x3000000
; SI-NEXT: s_or_b32 s24, s25, s24
-; SI-NEXT: v_writelane_b32 v42, s7, 52
-; SI-NEXT: s_and_b32 s7, s8, 0xffff0000
+; SI-NEXT: v_writelane_b32 v43, s13, 60
+; SI-NEXT: s_lshl_b32 s12, s12, 16
; SI-NEXT: s_add_i32 s4, s4, 0x3000000
; SI-NEXT: s_add_i32 s5, s5, 0x3000000
; SI-NEXT: s_add_i32 s46, s60, 0x3000000
+; SI-NEXT: s_add_i32 s47, s58, 0x3000000
; SI-NEXT: s_add_i32 s56, s56, 0x3000000
; SI-NEXT: s_add_i32 s57, s57, 0x3000000
-; SI-NEXT: s_add_i32 s11, s11, 0x3000000
-; SI-NEXT: s_add_i32 s12, s12, 0x3000000
-; SI-NEXT: s_add_i32 s13, s13, 0x3000000
-; SI-NEXT: s_add_i32 s16, s16, 0x3000000
-; SI-NEXT: s_add_i32 s18, s18, 0x3000000
-; SI-NEXT: s_add_i32 s19, s19, 0x3000000
+; SI-NEXT: s_add_i32 s58, s59, 0x3000000
+; SI-NEXT: s_add_i32 s8, s8, 0x3000000
+; SI-NEXT: s_add_i32 s6, s6, 0x3000000
+; SI-NEXT: s_add_i32 s10, s10, 0x3000000
+; SI-NEXT: s_add_i32 s7, s7, 0x3000000
+; SI-NEXT: s_add_i32 s9, s9, 0x3000000
; SI-NEXT: s_add_i32 s20, s20, 0x3000000
; SI-NEXT: s_add_i32 s24, s24, 0x3000000
-; SI-NEXT: v_writelane_b32 v42, s7, 53
-; SI-NEXT: s_lshl_b32 s7, s8, 16
+; SI-NEXT: v_writelane_b32 v43, s12, 61
+; SI-NEXT: s_and_b32 s12, s11, 0xffff0000
; SI-NEXT: s_and_b32 s27, s24, 0xffff0000
; SI-NEXT: s_lshl_b32 s26, s24, 16
; SI-NEXT: s_and_b32 s24, s20, 0xffff0000
; SI-NEXT: s_lshl_b32 s20, s20, 16
-; SI-NEXT: s_and_b32 s35, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s29, s23, 16
-; SI-NEXT: s_and_b32 s90, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s74, s22, 16
-; SI-NEXT: s_and_b32 s25, s21, 0xffff0000
+; SI-NEXT: s_and_b32 s75, s21, 0xffff0000
; SI-NEXT: s_lshl_b32 s21, s21, 16
-; SI-NEXT: s_and_b32 s75, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s22, s19, 16
-; SI-NEXT: s_and_b32 s61, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s76, s18, 16
-; SI-NEXT: s_and_b32 s77, s16, 0xffff0000
; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: s_and_b32 s89, s13, 0xffff0000
-; SI-NEXT: s_lshl_b32 s19, s13, 16
-; SI-NEXT: s_and_b32 s13, s12, 0xffff0000
-; SI-NEXT: s_lshl_b32 s88, s12, 16
-; SI-NEXT: s_and_b32 s60, s11, 0xffff0000
+; SI-NEXT: v_writelane_b32 v43, s12, 62
; SI-NEXT: s_lshl_b32 s18, s11, 16
-; SI-NEXT: s_and_b32 s23, s10, 0xffff0000
+; SI-NEXT: s_and_b32 s95, s9, 0xffff0000
+; SI-NEXT: s_lshl_b32 s9, s9, 16
+; SI-NEXT: s_and_b32 s30, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s17, s7, 16
+; SI-NEXT: s_and_b32 s34, s10, 0xffff0000
+; SI-NEXT: s_lshl_b32 s31, s10, 16
+; SI-NEXT: s_and_b32 s35, s6, 0xffff0000
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: v_writelane_b32 v42, s7, 54
-; SI-NEXT: s_and_b32 s72, s58, 0xffff0000
+; SI-NEXT: s_and_b32 s37, s8, 0xffff0000
+; SI-NEXT: s_lshl_b32 s36, s8, 16
+; SI-NEXT: s_and_b32 s38, s58, 0xffff0000
; SI-NEXT: s_lshl_b32 s99, s58, 16
-; SI-NEXT: s_and_b32 s7, s57, 0xffff0000
-; SI-NEXT: s_lshl_b32 s57, s57, 16
+; SI-NEXT: s_and_b32 s48, s57, 0xffff0000
+; SI-NEXT: s_lshl_b32 s39, s57, 16
; SI-NEXT: s_and_b32 s49, s56, 0xffff0000
-; SI-NEXT: s_lshl_b32 s8, s56, 16
+; SI-NEXT: s_lshl_b32 s98, s56, 16
; SI-NEXT: s_and_b32 s51, s47, 0xffff0000
; SI-NEXT: s_lshl_b32 s50, s47, 16
; SI-NEXT: s_and_b32 s52, s46, 0xffff0000
-; SI-NEXT: s_lshl_b32 s97, s46, 16
+; SI-NEXT: s_lshl_b32 s12, s46, 16
; SI-NEXT: s_and_b32 s54, s45, 0xffff0000
; SI-NEXT: s_lshl_b32 s53, s45, 16
; SI-NEXT: s_and_b32 s55, s44, 0xffff0000
-; SI-NEXT: s_lshl_b32 s28, s44, 16
+; SI-NEXT: s_lshl_b32 s96, s44, 16
; SI-NEXT: s_and_b32 s65, s43, 0xffff0000
; SI-NEXT: s_lshl_b32 s64, s43, 16
; SI-NEXT: s_and_b32 s66, s42, 0xffff0000
; SI-NEXT: s_lshl_b32 s87, s42, 16
-; SI-NEXT: s_and_b32 s68, s41, 0xffff0000
+; SI-NEXT: s_and_b32 s45, s41, 0xffff0000
; SI-NEXT: s_lshl_b32 s67, s41, 16
-; SI-NEXT: s_and_b32 s69, s40, 0xffff0000
+; SI-NEXT: s_and_b32 s57, s40, 0xffff0000
; SI-NEXT: s_lshl_b32 s86, s40, 16
-; SI-NEXT: s_and_b32 s62, s15, 0xffff0000
+; SI-NEXT: s_and_b32 s13, s15, 0xffff0000
; SI-NEXT: s_lshl_b32 s70, s15, 16
; SI-NEXT: s_and_b32 s80, s14, 0xffff0000
; SI-NEXT: s_lshl_b32 s85, s14, 16
-; SI-NEXT: s_and_b32 s92, s5, 0xffff0000
-; SI-NEXT: s_lshl_b32 s11, s5, 16
-; SI-NEXT: s_and_b32 s83, s4, 0xffff0000
+; SI-NEXT: s_and_b32 s82, s5, 0xffff0000
+; SI-NEXT: s_lshl_b32 s25, s5, 16
+; SI-NEXT: s_and_b32 s28, s4, 0xffff0000
; SI-NEXT: s_lshl_b32 s84, s4, 16
-; SI-NEXT: v_writelane_b32 v42, s7, 55
+; SI-NEXT: v_writelane_b32 v43, s9, 63
; SI-NEXT: .LBB89_3: ; %end
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -158410,134 +158449,136 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s23
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s77
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
+; SI-NEXT: v_readlane_b32 s4, v43, 56
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
+; SI-NEXT: v_readlane_b32 s4, v43, 57
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_readlane_b32 s4, v43, 58
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
+; SI-NEXT: v_readlane_b32 s4, v43, 59
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
+; SI-NEXT: v_readlane_b32 s4, v43, 60
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_readlane_b32 s4, v43, 61
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
+; SI-NEXT: v_readlane_b32 s4, v43, 62
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
+; SI-NEXT: v_readlane_b32 s4, v43, 63
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
-; SI-NEXT: v_readlane_b32 s4, v42, 50
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23
-; SI-NEXT: v_readlane_b32 s4, v42, 51
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s31
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
-; SI-NEXT: v_readlane_b32 s4, v42, 52
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT: v_readlane_b32 s4, v42, 53
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; SI-NEXT: v_readlane_b32 s4, v42, 54
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT: v_readlane_b32 s4, v42, 55
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s39
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s98
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -158551,7 +158592,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -158565,7 +158606,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s96
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -158584,21 +158625,21 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
@@ -158612,14 +158653,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s25
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
@@ -158666,109 +158707,118 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB89_4:
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: s_mov_b32 s7, s6
-; SI-NEXT: ; implicit-def: $sgpr8
; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: v_readlane_b32 s58, v43, 19
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: s_mov_b32 s95, s47
-; SI-NEXT: s_mov_b32 s94, s21
-; SI-NEXT: s_mov_b32 s93, s61
-; SI-NEXT: s_mov_b32 s34, s73
-; SI-NEXT: s_mov_b32 s91, s75
-; SI-NEXT: v_readlane_b32 s56, v43, 10
-; SI-NEXT: s_mov_b32 s36, s63
-; SI-NEXT: s_mov_b32 s38, s59
-; SI-NEXT: s_mov_b32 s37, s42
-; SI-NEXT: v_readlane_b32 s30, v43, 17
-; SI-NEXT: v_readlane_b32 s98, v43, 6
-; SI-NEXT: s_mov_b32 s46, s45
-; SI-NEXT: s_mov_b32 s31, s43
-; SI-NEXT: s_mov_b32 s78, s40
-; SI-NEXT: v_readlane_b32 s15, v43, 14
-; SI-NEXT: s_mov_b32 s39, s57
-; SI-NEXT: s_mov_b32 s48, s13
-; SI-NEXT: v_readlane_b32 s41, v43, 13
-; SI-NEXT: v_readlane_b32 s44, v43, 5
-; SI-NEXT: v_readlane_b32 s9, v43, 11
-; SI-NEXT: v_readlane_b32 s14, v43, 12
-; SI-NEXT: v_readlane_b32 s81, v43, 9
-; SI-NEXT: v_readlane_b32 s10, v43, 16
-; SI-NEXT: v_readlane_b32 s12, v43, 4
-; SI-NEXT: v_readlane_b32 s96, v43, 7
-; SI-NEXT: v_readlane_b32 s82, v43, 8
-; SI-NEXT: v_readlane_b32 s71, v43, 15
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s89, s76
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s88, s57
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s94, s26
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s93, s74
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s90, s63
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s91, s77
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: v_readlane_b32 s61, v44, 31
+; SI-NEXT: v_readlane_b32 s74, v44, 23
+; SI-NEXT: v_readlane_b32 s60, v44, 30
+; SI-NEXT: v_readlane_b32 s63, v44, 22
+; SI-NEXT: v_readlane_b32 s8, v44, 15
+; SI-NEXT: v_readlane_b32 s46, v44, 27
+; SI-NEXT: v_readlane_b32 s59, v44, 28
+; SI-NEXT: v_readlane_b32 s68, v44, 29
+; SI-NEXT: s_mov_b32 s92, s56
+; SI-NEXT: v_readlane_b32 s69, v44, 26
+; SI-NEXT: s_mov_b32 s76, s58
+; SI-NEXT: v_readlane_b32 s44, v44, 11
+; SI-NEXT: v_readlane_b32 s42, v44, 34
+; SI-NEXT: v_readlane_b32 s58, v44, 24
+; SI-NEXT: v_readlane_b32 s40, v44, 25
+; SI-NEXT: v_readlane_b32 s41, v44, 19
+; SI-NEXT: v_readlane_b32 s56, v44, 32
+; SI-NEXT: v_readlane_b32 s47, v44, 33
+; SI-NEXT: v_readlane_b32 s14, v44, 18
+; SI-NEXT: v_readlane_b32 s11, v44, 10
+; SI-NEXT: v_readlane_b32 s9, v44, 16
+; SI-NEXT: v_readlane_b32 s10, v44, 17
+; SI-NEXT: v_readlane_b32 s15, v44, 14
+; SI-NEXT: v_readlane_b32 s7, v44, 9
+; SI-NEXT: v_readlane_b32 s62, v44, 8
+; SI-NEXT: v_readlane_b32 s97, v44, 12
+; SI-NEXT: v_readlane_b32 s83, v44, 13
+; SI-NEXT: v_readlane_b32 s81, v44, 20
+; SI-NEXT: v_readlane_b32 s71, v44, 21
; SI-NEXT: ; kill: killed $sgpr6
; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr8
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; implicit-def: $sgpr27
; SI-NEXT: ; implicit-def: $sgpr20
; SI-NEXT: ; implicit-def: $sgpr24
; SI-NEXT: ; implicit-def: $sgpr29
-; SI-NEXT: ; implicit-def: $sgpr35
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr73
; SI-NEXT: ; implicit-def: $sgpr21
-; SI-NEXT: ; implicit-def: $sgpr25
-; SI-NEXT: ; implicit-def: $sgpr22
; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $sgpr78
; SI-NEXT: ; implicit-def: $sgpr77
; SI-NEXT: ; implicit-def: $sgpr79
-; SI-NEXT: ; implicit-def: $sgpr63
+; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: ; implicit-def: $sgpr19
-; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr88
-; SI-NEXT: ; implicit-def: $sgpr13
; SI-NEXT: ; implicit-def: $sgpr18
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr73
-; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr95
+; SI-NEXT: ; implicit-def: $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr31
+; SI-NEXT: ; implicit-def: $sgpr34
; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr35
+; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr37
; SI-NEXT: ; implicit-def: $sgpr99
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr39
+; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: ; implicit-def: $sgpr98
; SI-NEXT: ; implicit-def: $sgpr49
; SI-NEXT: ; implicit-def: $sgpr50
; SI-NEXT: ; implicit-def: $sgpr51
-; SI-NEXT: ; implicit-def: $sgpr97
+; SI-NEXT: ; implicit-def: $sgpr12
; SI-NEXT: ; implicit-def: $sgpr52
; SI-NEXT: ; implicit-def: $sgpr53
; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr96
; SI-NEXT: ; implicit-def: $sgpr55
; SI-NEXT: ; implicit-def: $sgpr64
; SI-NEXT: ; implicit-def: $sgpr65
; SI-NEXT: ; implicit-def: $sgpr87
; SI-NEXT: ; implicit-def: $sgpr66
; SI-NEXT: ; implicit-def: $sgpr67
-; SI-NEXT: ; implicit-def: $sgpr68
+; SI-NEXT: ; implicit-def: $sgpr45
; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $sgpr69
+; SI-NEXT: ; implicit-def: $sgpr57
; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr13
; SI-NEXT: ; implicit-def: $sgpr85
; SI-NEXT: ; implicit-def: $sgpr80
-; SI-NEXT: ; implicit-def: $sgpr11
-; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr25
+; SI-NEXT: ; implicit-def: $sgpr82
; SI-NEXT: ; implicit-def: $sgpr84
-; SI-NEXT: ; implicit-def: $sgpr83
+; SI-NEXT: ; implicit-def: $sgpr28
; SI-NEXT: s_branch .LBB89_2
;
; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar:
@@ -158831,13 +158881,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v27
; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
@@ -158849,46 +158900,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200
@@ -158897,34 +158944,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
-; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22
-; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
+; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v26
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v28
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0
+; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -158943,6 +158993,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328
@@ -158951,12 +159006,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36
-; VI-NEXT: s_waitcnt vmcnt(11)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5
-; VI-NEXT: s_waitcnt vmcnt(10)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0
; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44
@@ -158965,47 +159016,45 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116
; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:172
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:196
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:212
; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:252
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
@@ -159015,46 +159064,50 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB89_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -159071,11 +159124,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -159099,6 +159151,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v17, v10
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
@@ -159115,38 +159168,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v40, v42
+; VI-NEXT: v_mov_b32_e32 v42, v44
+; VI-NEXT: v_mov_b32_e32 v44, v45
+; VI-NEXT: v_mov_b32_e32 v45, v62
+; VI-NEXT: v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v34, v24
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -159154,77 +159212,74 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v45, v62
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v32, v1
; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v22
-; VI-NEXT: v_mov_b32_e32 v41, v24
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v34, v0
+; VI-NEXT: v_mov_b32_e32 v33, v0
; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v37, v1
-; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v55, v26
+; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v50, v26
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v39, v0
-; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v49, v1
-; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v43, v27
+; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v51, v0
-; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v35, v1
-; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v53, v28
+; VI-NEXT: v_mov_b32_e32 v53, v1
+; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v52, v28
; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v33, v0
-; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v36, v0
+; VI-NEXT: v_mov_b32_e32 v55, v0
+; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_mov_b32_e32 v35, v0
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v41, v1
+; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v63, v27
+; VI-NEXT: v_mov_b32_e32 v46, v57
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v56, v0
+; VI-NEXT: v_mov_b32_e32 v36, v0
; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v58, v1
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v61, v60
-; VI-NEXT: v_mov_b32_e32 v60, v59
+; VI-NEXT: v_mov_b32_e32 v56, v1
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v61, v59
; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -159236,55 +159291,53 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v50, v0
+; VI-NEXT: v_mov_b32_e32 v58, v0
; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v52, v0
-; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v43, v0
+; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v46, v1
-; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v60, v1
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v63, v0
-; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v54, v0
+; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v47, v1
-; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v57, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
@@ -159316,12 +159369,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_cbranch_execnz .LBB89_3
; VI-NEXT: .LBB89_2: ; %cmp.true
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59
-; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
@@ -159340,165 +159391,147 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: s_lshl_b32 s9, s19, 8
; VI-NEXT: s_add_i32 s16, s16, 3
; VI-NEXT: s_lshl_b32 s10, s17, 8
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59
+; VI-NEXT: v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62
-; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44
-; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45
-; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44
+; VI-NEXT: v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42
-; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40
-; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60
-; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61
-; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46
+; VI-NEXT: v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48
; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38
; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50
+; VI-NEXT: v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v36
; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49
; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57
-; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v58
+; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34
; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14
@@ -159507,67 +159540,78 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36
-; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26
-; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52
-; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54
-; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v35
+; VI-NEXT: v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v25
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v59
+; VI-NEXT: v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13
-; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21
-; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25
+; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51
; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59
-; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1
+; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v63
+; VI-NEXT: v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12
-; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25
+; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26
+; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -159591,15 +159635,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10
; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55
; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53
-; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v52
+; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v53
+; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9
; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27
-; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28
-; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -159615,18 +159658,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42
; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40
-; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1
-; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8
; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11
-; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
@@ -159666,19 +159705,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46
; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2
+; VI-NEXT: v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4
; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4
@@ -159745,35 +159794,38 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB89_4:
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v61, v60
-; VI-NEXT: v_mov_b32_e32 v60, v59
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v61, v59
+; VI-NEXT: v_mov_b32_e32 v46, v57
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v40, v42
+; VI-NEXT: v_mov_b32_e32 v42, v44
+; VI-NEXT: v_mov_b32_e32 v44, v45
; VI-NEXT: v_mov_b32_e32 v45, v62
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v57, v5
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v47, v4
-; VI-NEXT: v_mov_b32_e32 v63, v3
-; VI-NEXT: v_mov_b32_e32 v53, v28
-; VI-NEXT: v_mov_b32_e32 v43, v27
-; VI-NEXT: v_mov_b32_e32 v55, v26
-; VI-NEXT: v_mov_b32_e32 v41, v24
-; VI-NEXT: v_mov_b32_e32 v54, v22
+; VI-NEXT: v_mov_b32_e32 v54, v3
+; VI-NEXT: v_mov_b32_e32 v52, v28
+; VI-NEXT: v_mov_b32_e32 v63, v27
+; VI-NEXT: v_mov_b32_e32 v50, v26
+; VI-NEXT: v_mov_b32_e32 v34, v24
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_branch .LBB89_2
@@ -159835,18 +159887,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43
; GFX9-NEXT: s_waitcnt vmcnt(23)
@@ -159875,10 +159927,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
@@ -159890,7 +159942,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
@@ -159938,7 +159990,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
@@ -159965,23 +160017,23 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
-; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
@@ -159994,48 +160046,49 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92
; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100
; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:116
; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:140
; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:156
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164
-; GFX9-NEXT: s_waitcnt vmcnt(21)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204
; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:228
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236
; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:260
; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276
; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284
; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:316
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
@@ -160046,55 +160099,54 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(33)
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(36)
-; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: s_waitcnt vmcnt(38)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
@@ -160104,7 +160156,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB89_2
@@ -160117,7 +160169,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -160154,10 +160206,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
@@ -160173,13 +160225,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -160187,7 +160239,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -160228,8 +160280,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v52, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v50, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -160247,16 +160299,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v48, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_mov_b32_e32 v33, v45
+; GFX9-NEXT: v_mov_b32_e32 v33, v46
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
@@ -160269,7 +160321,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -160278,7 +160330,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -160286,121 +160338,122 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v34, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_mov_b32_e32 v46, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: v_mov_b32_e32 v35, v45
-; GFX9-NEXT: v_mov_b32_e32 v45, v61
-; GFX9-NEXT: v_mov_b32_e32 v61, v42
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v38, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_mov_b32_e32 v54, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v41, v57
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v57, v59
; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v56, v42
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB89_3
; GFX9-NEXT: .LBB89_2:
; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: v_mov_b32_e32 v33, v45
-; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v33, v46
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v56, v61
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: .LBB89_3: ; %Flow
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -160603,7 +160656,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -160663,11 +160716,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
@@ -160702,7 +160755,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; GFX9-NEXT: v_add_u32_e32 v24, 3, v24
-; GFX9-NEXT: v_add_u32_e32 v26, 3, v61
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v62
; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24
; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48
@@ -160711,7 +160764,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v26, 3, v45
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v61
; GFX9-NEXT: v_add_u32_e32 v20, 3, v20
; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20
@@ -160720,7 +160773,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: v_add_u32_e32 v26, 3, v56
; GFX9-NEXT: v_add_u32_e32 v21, 3, v21
-; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21
; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21
@@ -162564,29 +162617,30 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -162675,31 +162729,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v36
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v39
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v55
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v62, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v43
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v56
-; SI-NEXT: v_mul_f32_e32 v59, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v47
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v60
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v63
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88
@@ -162710,21 +162764,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; kill: killed $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v48
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v55
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v63
-; SI-NEXT: ; kill: killed $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v42
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v61
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; kill: killed $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v34
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v35
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v36
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v38
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v54
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45
+; SI-NEXT: ; kill: killed $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr35
@@ -162748,30 +162802,30 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; kill: killed $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; kill: killed $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v63, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v4
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v7
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; kill: killed $vgpr7
; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v5
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10
; SI-NEXT: ; kill: killed $vgpr7
@@ -162783,7 +162837,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; kill: killed $vgpr7
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -162956,9 +163010,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v3
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr3
@@ -162971,10 +163025,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(4)
@@ -163004,7 +163058,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v26, v1, v2, 16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16
@@ -163016,7 +163070,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v23, v1, v2, 16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16
@@ -163028,7 +163082,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v18, v1, v2, 16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16
@@ -163037,88 +163091,148 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v15, v1, v57, 16
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v15, v1, v16, 16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v13, v1, v61, 16
-; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: v_alignbit_b32 v13, v1, v59, 16
+; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_alignbit_b32 v48, v1, v2, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v11, v1, v60, 16
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v11, v1, v51, 16
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_alignbit_b32 v37, v1, v2, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v8, v1, v51, 16
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53
+; SI-NEXT: v_alignbit_b32 v8, v1, v46, 16
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_alignbit_b32 v34, v1, v2, 16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46
-; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; SI-NEXT: v_alignbit_b32 v5, v1, v6, 16
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_alignbit_b32 v31, v1, v2, 16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33
; SI-NEXT: v_alignbit_b32 v4, v1, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v63
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; kill: killed $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56
-; SI-NEXT: v_alignbit_b32 v3, v1, v41, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41
+; SI-NEXT: v_alignbit_b32 v3, v1, v43, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_alignbit_b32 v25, v1, v2, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63
-; SI-NEXT: v_alignbit_b32 v2, v1, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; SI-NEXT: v_alignbit_b32 v2, v1, v36, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
-; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT: v_alignbit_b32 v20, v7, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6
+; SI-NEXT: v_alignbit_b32 v22, v1, v6, 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v56
+; SI-NEXT: v_alignbit_b32 v20, v6, v7, 16
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v62
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_alignbit_b32 v1, v1, v60, 16
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; kill: killed $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_alignbit_b32 v17, v6, v7, 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v47
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v14, v6, v39, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v42
+; SI-NEXT: v_alignbit_b32 v10, v6, v44, 16
+; SI-NEXT: v_mov_b32_e32 v63, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v61
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v27
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v16
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v58
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v49
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v51
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v53
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v12
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v57
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v24
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v56
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v44
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v42
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v30
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24
@@ -163221,9 +163335,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16
@@ -163231,49 +163342,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_alignbit_b32 v17, v7, v9, 16
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12
+; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16
-; SI-NEXT: v_alignbit_b32 v14, v7, v27, 16
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16
-; SI-NEXT: v_alignbit_b32 v10, v7, v39, 16
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v7, v7, v55, 16
+; SI-NEXT: v_alignbit_b32 v7, v7, v45, 16
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8
@@ -163378,173 +163473,135 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v58
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v47
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v57
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v42
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v59
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v49
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v51
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v62
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v45
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v24
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v62
+; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v47
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19
-; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v36
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; kill: killed $vgpr6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; kill: killed $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: .LBB90_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB90_4
; SI-NEXT: ; %bb.3: ; %cmp.true
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v39
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v58
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v33
; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v44
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v5, v8, v5, 16
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v8
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v8
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v6, v7, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v44
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v10, v63, v10, 16
+; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v19
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -163553,39 +163610,41 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v62
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21
; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v17, v17, v15, 16
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
@@ -163594,7 +163653,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49
; SI-NEXT: s_waitcnt vmcnt(5)
@@ -163613,6 +163671,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49
; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v51
; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v56
+; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v55
; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41
; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
@@ -163620,91 +163686,80 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v20, v15, 16
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v20, v18, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v18
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v30
+; SI-NEXT: v_alignbit_b32 v20, v16, v18, 16
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v18
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v18
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v23, v18, 16
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_alignbit_b32 v18, v22, v18, 16
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v23
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v45
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v22, v22, v16, 16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v16
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v62
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v26
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v29, v26, 16
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
+; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v29
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v29
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v28, v28, v27, 16
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v16
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v24
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v30
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v21
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v32
@@ -163724,8 +163779,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v35
@@ -163987,8 +164040,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: .LBB90_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xff, v38
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
@@ -164442,7 +164495,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xff, v17
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v47
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; SI-NEXT: v_or_b32_e32 v4, v4, v5
@@ -164475,7 +164528,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v61
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v4
@@ -164506,17 +164559,17 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v63
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
@@ -164546,7 +164599,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -164597,12 +164650,65 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; kill: killed $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr34
; VI-NEXT: ; implicit-def: $vgpr45
; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; implicit-def: $vgpr60
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr42
@@ -164615,11 +164721,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: ; implicit-def: $vgpr62
; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr36
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr52
@@ -164722,173 +164830,115 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: ; kill: killed $vgpr33
; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB90_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14]
@@ -164913,17 +164963,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32]
-; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, v46
; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30]
-; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v11
; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v46, v63
-; VI-NEXT: v_mov_b32_e32 v63, v50
+; VI-NEXT: v_mov_b32_e32 v46, v51
+; VI-NEXT: v_mov_b32_e32 v45, v50
; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28]
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9
@@ -164945,7 +164993,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18]
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11
; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10
@@ -164954,30 +165003,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8
; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32
; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32
; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32
; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30
-; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v30
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v19
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v19
+; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v18
+; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v17
; VI-NEXT: v_mov_b32_e32 v47, v34
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v2
; VI-NEXT: .LBB90_2: ; %Flow
; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v34, v36
; VI-NEXT: s_xor_b64 exec, exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB90_4
; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18
; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
@@ -165596,109 +165644,112 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v25
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v25
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v24
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v23
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v22
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v18
-; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11
; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11
@@ -165712,69 +165763,63 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8
; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32
; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32
; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32
; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31
-; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v31
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30
-; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v17
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v31
+; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v19
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v19
+; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v18
+; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v17
; VI-NEXT: .LBB90_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v44
+; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v63
; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v43
; VI-NEXT: v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v43
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v43, v44, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v43, v36, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -165787,22 +165832,28 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -165850,10 +165901,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45
; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -165913,48 +165968,34 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38
; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v53
; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v62
; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -165965,10 +166006,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -165978,9 +166019,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -165991,10 +166032,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -166004,9 +166045,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -166017,8 +166058,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -166028,9 +166069,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -166041,10 +166082,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -166055,9 +166096,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -166068,26 +166109,34 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
-; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v34
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45
; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
@@ -166140,714 +166189,716 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v46, v15
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: v_mov_b32_e32 v47, v16
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr36
-; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: v_mov_b32_e32 v36, v15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: v_mov_b32_e32 v37, v16
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr57
; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: ; implicit-def: $vgpr38
-; GFX9-NEXT: ; implicit-def: $vgpr51
-; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr49
; GFX9-NEXT: ; implicit-def: $vgpr52
-; GFX9-NEXT: ; implicit-def: $vgpr16
-; GFX9-NEXT: ; implicit-def: $vgpr42
-; GFX9-NEXT: ; implicit-def: $vgpr39
-; GFX9-NEXT: ; implicit-def: $vgpr45
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: ; implicit-def: $vgpr62
-; GFX9-NEXT: ; implicit-def: $vgpr61
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr49
; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr45
; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr36
-; GFX9-NEXT: ; implicit-def: $vgpr37
-; GFX9-NEXT: ; implicit-def: $vgpr57
-; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr63
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(33)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB90_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v47
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(38)
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v47
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v47
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v46
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v46
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v37
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v37
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v9
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v6
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[46:47]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18]
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v24
-; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v17
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v32
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(59)
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v31
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v29
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v28
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v27
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v26
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v25
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v24
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v23
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v22
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v21
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v21
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v20
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[36:37]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v36
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v17
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: .LBB90_2: ; %Flow
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v58, v57
-; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB90_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v18
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v18
+; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX9-NEXT: s_movk_i32 s6, 0x7fff
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v33, v16, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc
+; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX9-NEXT: v_add3_u32 v34, v34, v33, s6
+; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v39, v34, v35, vcc
+; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v62, v33, v34, vcc
+; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX9-NEXT: s_mov_b32 s7, 0x7060302
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v33, vcc
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v15, v17, v15, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v63, v18, v33, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_perm_b32 v16, v62, v39, s7
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v19
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v16, v15, v63, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v20, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: v_perm_b32 v34, v15, v33, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: v_perm_b32 v33, v15, v18, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v20
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v20
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v15, v17, v15, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v22
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v19
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: v_perm_b32 v17, v15, v20, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v22
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v21
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v43, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v16, v16, v15, s7
+; GFX9-NEXT: v_perm_b32 v15, v17, v43, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v24
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v38, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v23
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v21
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v21
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v61, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: v_perm_b32 v17, v15, v61, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v24
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_perm_b32 v15, v17, v22, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v26
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v25
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v50, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_perm_b32 v16, v24, v38, s7
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v24
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: v_perm_b32 v59, v15, v18, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v23
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: v_perm_b32 v58, v15, v18, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v26
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v26
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: v_perm_b32 v63, v15, v18, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v25
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v25
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: v_perm_b32 v62, v15, v18, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v28
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v28
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v60, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v27
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v27
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: v_perm_b32 v56, v15, v26, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v30
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v29
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v30, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v29
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: v_perm_b32 v33, v15, v25, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v32
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: s_waitcnt vmcnt(51)
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v31
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v32, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v31
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v16, v17, vcc
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; GFX9-NEXT: v_perm_b32 v35, v15, v24, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_perm_b32 v15, v17, v50, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v28
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v27
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v51, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_perm_b32 v16, v26, v23, s7
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_perm_b32 v15, v17, v51, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v30
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v30, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v48, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_perm_b32 v16, v28, v25, s7
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_perm_b32 v15, v17, v48, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v32
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: s_waitcnt vmcnt(47)
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v32, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v31
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v49, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_perm_b32 v16, v30, v27, s7
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_perm_b32 v15, v17, v49, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v16, v17, vcc
-; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v18, v19, vcc
+; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v15, v16, vcc
-; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2
+; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v15, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v52, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
-; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v15, vcc
-; GFX9-NEXT: v_perm_b32 v37, v1, v23, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
+; GFX9-NEXT: v_perm_b32 v60, v1, v52, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
-; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v2, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v17, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
@@ -166861,13 +166912,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v53, v2, v4, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_perm_b32 v48, v1, v20, s7
+; GFX9-NEXT: v_perm_b32 v54, v1, v53, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -166876,7 +166927,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
@@ -166897,7 +166947,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_perm_b32 v50, v1, v17, s7
+; GFX9-NEXT: v_perm_b32 v33, v1, v17, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v8
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -166926,7 +166976,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc
-; GFX9-NEXT: v_perm_b32 v52, v1, v4, s7
+; GFX9-NEXT: v_perm_b32 v56, v1, v4, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v10
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -166945,480 +166995,515 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
-; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1
+; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v42, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6
-; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1
+; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc
-; GFX9-NEXT: v_perm_b32 v39, v1, v3, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v42, vcc
+; GFX9-NEXT: v_perm_b32 v40, v1, v3, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v12
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6
-; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1
+; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_bfe_u32 v12, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v44, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v1, s6
-; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1
+; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v11
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc
-; GFX9-NEXT: v_bfe_u32 v15, v1, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v44, vcc
+; GFX9-NEXT: v_bfe_u32 v44, v1, 16, 1
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_add3_u32 v15, v15, v1, s6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX9-NEXT: v_add3_u32 v44, v44, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc
-; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v44, v46, vcc
+; GFX9-NEXT: v_bfe_u32 v44, v11, 16, 1
+; GFX9-NEXT: v_add3_u32 v44, v44, v11, s6
+; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc
-; GFX9-NEXT: v_perm_b32 v54, v11, v2, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v44, v46, vcc
+; GFX9-NEXT: v_perm_b32 v44, v11, v2, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14
; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1
+; GFX9-NEXT: v_bfe_u32 v46, v11, 16, 1
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11
+; GFX9-NEXT: v_add3_u32 v46, v46, v11, s6
+; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc
-; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v46, v47, vcc
+; GFX9-NEXT: v_bfe_u32 v46, v14, 16, 1
+; GFX9-NEXT: v_add3_u32 v46, v46, v14, s6
+; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v13
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v46, v47, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v46, 16, v13
+; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_bfe_u32 v58, v46, 16, 1
; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v41, vcc
-; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13
+; GFX9-NEXT: v_add3_u32 v58, v58, v46, s6
+; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v46
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46
+; GFX9-NEXT: v_bfe_u32 v46, v13, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v58, v59, vcc
+; GFX9-NEXT: v_add3_u32 v46, v46, v13, s6
+; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc
-; GFX9-NEXT: v_perm_b32 v41, v13, v1, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v47
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v46, v58, vcc
+; GFX9-NEXT: v_perm_b32 v16, v32, v29, s7
+; GFX9-NEXT: v_perm_b32 v46, v13, v1, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v37
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13
+; GFX9-NEXT: v_bfe_u32 v58, v13, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v37
+; GFX9-NEXT: v_add3_u32 v58, v58, v13, s6
+; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v47
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v44, v16, v44, vcc
-; GFX9-NEXT: v_perm_b32 v16, v44, v13, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v13
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v58, v59, vcc
+; GFX9-NEXT: v_bfe_u32 v58, v16, 16, 1
+; GFX9-NEXT: v_add3_u32 v58, v58, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v58, v59, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v58, 16, v36
+; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58
+; GFX9-NEXT: v_bfe_u32 v35, v58, 16, 1
+; GFX9-NEXT: v_add3_u32 v35, v35, v58, s6
+; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v58
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v35, v15, vcc
+; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v36
+; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1
+; GFX9-NEXT: v_add3_u32 v36, v36, v35, s6
+; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc
+; GFX9-NEXT: v_perm_b32 v58, v35, v15, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX9-NEXT: v_perm_b32 v59, v16, v13, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14
-; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; GFX9-NEXT: v_perm_b32 v53, v8, v5, s7
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX9-NEXT: v_perm_b32 v55, v19, v20, s7
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_perm_b32 v57, v8, v5, s7
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v32
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v30
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v28
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_perm_b32 v36, v32, v29, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v25
-; GFX9-NEXT: v_perm_b32 v38, v22, v31, s7
-; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GFX9-NEXT: v_perm_b32 v47, v14, v11, s7
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v45, v12, v9, s7
+; GFX9-NEXT: v_perm_b32 v41, v10, v7, s7
+; GFX9-NEXT: v_perm_b32 v34, v6, v18, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v38
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v39
+; GFX9-NEXT: v_mov_b32_e32 v38, v54
+; GFX9-NEXT: v_perm_b32 v61, v21, v31, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v43
+; GFX9-NEXT: v_mov_b32_e32 v43, v34
+; GFX9-NEXT: v_mov_b32_e32 v39, v55
+; GFX9-NEXT: v_mov_b32_e32 v42, v33
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[38:39]
; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v31
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v23
-; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v49
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v48
+; GFX9-NEXT: v_mov_b32_e32 v48, v60
+; GFX9-NEXT: v_mov_b32_e32 v49, v61
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v51
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v50
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v20
-; GFX9-NEXT: v_perm_b32 v49, v19, v21, s7
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v46
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v46
-; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v43, v45, v43, vcc
-; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v15
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v45, v46, vcc
-; GFX9-NEXT: v_perm_b32 v15, v15, v43, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v44
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v43
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[15:16]
-; GFX9-NEXT: v_perm_b32 v51, v6, v18, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7
-; GFX9-NEXT: v_perm_b32 v57, v28, v60, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v53
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v44
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v52
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v41
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v42
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v60
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v61
-; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v39
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v63
+; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v17
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v15
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v62
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[58:59]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[46:47]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[41:42]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[44:45]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[54:55]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[40:41]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[39:40]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[56:57]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[52:53]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[50:51]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[48:49]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[48:49]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[42:43]
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v59
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[61:62]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[37:38]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[35:36]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v62
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[33:34]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[56:57]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[62:63]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[58:59]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[60:61]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44]
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[33:34]
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v42
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v42
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v41
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v41
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v55
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v55
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v40
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v39
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51]
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v59
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v58
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v47
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v53
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v53
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v51
-; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v51
-; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v38
-; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v38
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v37
-; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36
-; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36
-; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35
-; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v57
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v57
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v56
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v56
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v63
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v50
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v50
-; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v59
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v47
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v46
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v46
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v45
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v44
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v41
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v40
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v40
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v57
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v57
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v56
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v43
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v43
+; GFX9-NEXT: v_mov_b32_e32 v43, v38
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v39
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v43
+; GFX9-NEXT: v_mov_b32_e32 v43, v48
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v38
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v43
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v61
+; GFX9-NEXT: v_mov_b32_e32 v43, v61
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v48
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v43
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v58
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v45
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v42
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v39
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v62
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v56
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v49
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v49
+; GFX9-NEXT: v_mov_b32_e32 v54, v56
+; GFX9-NEXT: v_mov_b32_e32 v38, v58
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v43
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v44
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v44
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v59
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v61
-; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v61
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v60
-; GFX9-NEXT: v_mov_b32_e32 v33, v60
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v52
-; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49
-; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48
-; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
-; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v33
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v61
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v62
-; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v62
-; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v61
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v60
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v43
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 24, v44
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v44
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v43
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v45
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v45
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v44
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v44
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v43
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v44
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v43
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v44
+; GFX9-NEXT: v_mov_b32_e32 v44, v35
+; GFX9-NEXT: v_mov_b32_e32 v35, v50
+; GFX9-NEXT: v_mov_b32_e32 v39, v51
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v63
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v62
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v62
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v49, v33
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v63
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v49
+; GFX9-NEXT: v_mov_b32_e32 v49, v47
+; GFX9-NEXT: v_mov_b32_e32 v47, v45
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v60
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v61
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v61
+; GFX9-NEXT: v_mov_b32_e32 v61, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v60
-; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v59
; GFX9-NEXT: .LBB90_4: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v39
-; GFX9-NEXT: v_or_b32_sdwa v8, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v45
-; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v40
-; GFX9-NEXT: v_or_b32_sdwa v10, v10, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v54
-; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51
-; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v48
-; GFX9-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v57
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v59
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v46
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v42
-; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v41
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v40
; GFX9-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v62
-; GFX9-NEXT: v_or_b32_sdwa v17, v17, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v60
-; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41
-; GFX9-NEXT: v_or_b32_sdwa v18, v18, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15
-; GFX9-NEXT: v_or_b32_sdwa v12, v12, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT: v_or_b32_sdwa v16, v47, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15
-; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15
-; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v51
-; GFX9-NEXT: v_or_b32_sdwa v38, v38, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; GFX9-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v55
+; GFX9-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v53
+; GFX9-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; GFX9-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; GFX9-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v35
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v16, v36, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v60
+; GFX9-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v56
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v33
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
+; GFX9-NEXT: v_or_b32_sdwa v35, v37, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v36, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15
-; GFX9-NEXT: v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v15
+; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -167428,16 +167513,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -167447,16 +167530,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -167466,18 +167549,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -167485,16 +167565,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -167504,32 +167584,32 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -167538,49 +167618,57 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50
; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v63
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55
-; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v58
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58
-; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48
; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
@@ -167588,11 +167676,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -167601,10 +167689,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -167614,23 +167702,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -167640,23 +167726,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -167666,11 +167750,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -167679,15 +167763,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -167757,117 +167838,117 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
@@ -167889,183 +167970,185 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 8, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v2.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v32.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v22.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v20.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v94.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v17.l
; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v20, 16, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v18
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v71, v37, v39 :: v_dual_and_b32 v34, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_lshlrev_b32 v17, 16, v17
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v17, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v55.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v36, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v71.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v34, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v36, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v34
; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v37, v51, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v48, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v37, v51, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v80.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v83.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v49, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v70
+; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v55
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v34, v17
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v35, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v17
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v19, 16, v19
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v83, v33, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v86.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v33, v22, 0x7fff
@@ -168073,264 +168156,267 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v71
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v84, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v70
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v82.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v84.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_cndmask_b32 v84, v19, v39
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v19, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v24, 16, v24
+; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v80
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v19
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v21, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v22
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v84
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_add_f32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v22
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v24
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v85.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v23
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v97.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v26, 16, v26
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v33, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v86.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v23, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v33, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v23, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v96.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 8, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v34, v36, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v34, v36, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v96.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v100.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v87
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v25
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v24
; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v99.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v85, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v97.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v36, v26, v38 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v36, v26, v38 :: v_dual_and_b32 v35, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v28, 16, v28
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v103.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v26, v33, v28, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v26, v33, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v26, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v98
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v25, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v85
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v25, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v99.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v101.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v100
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v28
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v87
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v28
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v112.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v32, 16, v32
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v112.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v103.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v113.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v115, v34, v36 :: v_dual_add_f32 v34, 0x40c00000, v38
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v113.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v115.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v98
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v31, 0x40c00000, v31
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v30
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v114.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v35, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v31, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v114.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v116, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v116.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v35, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v32, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v32, v33, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v132, v32, v33 :: v_dual_add_f32 v33, 0x40c00000, v38
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v115
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v132, v31, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v102
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v132.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v31, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v37, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v32
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v131.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v144, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v117
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v32
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v148, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v132
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v4, 16, 1
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36
@@ -168342,179 +168428,170 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v37, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v148.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v144.h
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v33, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v33, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v147.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v34, v36, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v162.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v163.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v39 :: v_dual_add_f32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v128
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v36, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v147.h
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v38 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 8, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v149
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 24, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 8, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v160, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v35, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v164.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v179, v33, v37 :: v_dual_add_f32 v8, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v180.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v179.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v160.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v33, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v6, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v6, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v39
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v5, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v131
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v134, v5, v38 :: v_dual_and_b32 v39, 0xffff0000, v9
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v37, v36
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v36, 16, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v10
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v167.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v178.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v46, v35, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v179
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v134
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v45, v7, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v42.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v182, v7, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v46.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v10, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v145, v35, v38 :: v_dual_lshlrev_b32 v10, 16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v45.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v182.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v42
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v37
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v145
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v51
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 24, v10
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v59, v38, v50, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v43, v38, v50 :: v_dual_add_f32 v12, 0x40c00000, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v59.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v74, v35, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v43.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v60, v35, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v60, v48, v52 :: v_dual_add_f32 v37, 0x40c00000, v51
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v48, v52, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v14, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v60
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v150
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v74.h
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v35, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v60.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v12
-; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v7, 0x7fff
@@ -168522,73 +168599,75 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v13
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v35, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v72, v37, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v35, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v76, v37, v38 :: v_dual_and_b32 v37, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v39 :: v_dual_lshlrev_b32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v39
; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v14, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v77, v39, v48, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 0x40c00000, v16 :: v_dual_add_f32 v13, 0x40c00000, v13
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v49, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v58, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v77.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v58.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v14, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v13, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v13
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v16, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v94, v35, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v79, v35, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v74, v13, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v37 :: v_dual_lshlrev_b32 v37, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v48, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v91, v13, v49, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v79.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v94.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v88, v39, v51, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v89.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v76
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v92, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v72.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v165
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v183, v35, v48, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v104.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v91.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v88.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v74.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v39, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 24, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v92
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 8, v14
+; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v183
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13
@@ -168607,27 +168686,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v7
; GFX11-TRUE16-NEXT: .LBB90_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v144.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v117.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v106.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v128.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v131.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v132.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v107.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v78.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v91.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v4.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.h
@@ -168636,7 +168715,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v162.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v163.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v6
@@ -168645,15 +168724,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.l, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v95.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v93.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v104.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v95.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.h
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v180.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v90.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v179.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v94.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
@@ -168664,10 +168743,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v164.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v160.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v93.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v131.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
@@ -168676,8 +168755,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v46.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v75.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v42.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v90.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
@@ -168685,24 +168764,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v178.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v63.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v179.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v167.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v77.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v134.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v76.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v74.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v60.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v63.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v45.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v182.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v62.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v8.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h
@@ -168715,25 +168794,25 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v89.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v40.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v72.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v47.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v42.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v43.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v60.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v44.l
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v94.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v167.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v59.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v182.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v79.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v43.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v45.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v14.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
@@ -168745,8 +168824,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v77.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v58.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v178.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h
@@ -168754,10 +168833,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v20, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v76.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v104.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v88.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v166.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
@@ -168765,24 +168844,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v91.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v92.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v74.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v183.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v89.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v79.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v92.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v78.l
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v18
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
@@ -168791,24 +168870,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v75.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v61.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v58.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v73.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v44.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v59.l
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v9
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
@@ -168819,29 +168898,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v177.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v41.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v41.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v183.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v46.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v40.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v166.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v180.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v177.l
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
@@ -168854,16 +168933,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v151.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
@@ -168876,16 +168955,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v135.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v133.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
@@ -168900,17 +168979,17 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v130.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v116.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v129.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v114.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v118.l
; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
@@ -168978,128 +169057,132 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x15
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12
+; GFX11-FAKE16-NEXT: s_clause 0x19
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:12
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
@@ -169108,863 +169191,867 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_2
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[29:30]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[15:16]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[11:12]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[7:8]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[98:99], 24, v[3:4]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v13
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v13
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v12
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v12
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v11
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v11
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v9
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v9
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v8
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v8
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v8
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v7
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v7
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v6
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v6
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v6
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v3
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v32
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v32
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v32
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v31
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v31
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v30
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v30
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v30
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v28
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v27
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v26
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v26
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v26
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v25
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v25
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v24
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v24
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v24
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v23
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v23
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v22
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v22
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v22
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v21
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v21
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v20
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v20
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v20
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v18
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v17
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v17
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[31:32]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[27:28]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 24, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 8, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 24, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 24, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 24, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 24, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v78, 8, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v88, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v89, 8, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 24, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 24, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 24, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 8, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 24, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 8, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 24, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 24, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v77, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[13:14]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[9:10]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[5:6]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[1:2]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[25:26]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[21:22]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[19:20]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
; GFX11-FAKE16-NEXT: .LBB90_2: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_4
; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v18
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v33, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v33, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v36 :: v_dual_and_b32 v18, 0xffff0000, v18
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: v_add3_u32 v33, v48, v36, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v35 :: v_dual_add_f32 v18, 0x40c00000, v18
; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v18, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v18
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v77, v37, v39 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v20
-; GFX11-FAKE16-NEXT: v_perm_b32 v69, v77, v17, 0x7060302
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v132, v37, v39 :: v_dual_lshlrev_b32 v37, 16, v20
; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v34
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v69
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v34, v132, v17, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v132
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v38, v18 :: v_dual_add_f32 v18, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v69
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v38, v18 :: v_dual_add_f32 v20, 0x40c00000, v20
-; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v35, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v37
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v35, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v18, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v36, v39, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v18, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-FAKE16-NEXT: v_perm_b32 v68, v34, v33, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v35, v37, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v19 :: v_dual_lshlrev_b32 v37, 16, v22
-; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v39 :: v_dual_add_f32 v20, 0x40c00000, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v18, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v20, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v39, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v68
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v68
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v38, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v65, v19, v18, 0x7060302
-; GFX11-FAKE16-NEXT: v_add3_u32 v20, v39, v36, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v19
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v65
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v20, v34, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v37
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
-; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v20, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v39, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v20, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v33, v33, v35, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v20, v49, v37, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v22
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v38, v19 :: v_dual_and_b32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v36, v48, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v49
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v77, 16, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v79, 8, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v39, v50, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v20
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v20, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v38, v19, v18, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v37, v37, v36, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v20, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-FAKE16-NEXT: v_perm_b32 v64, v35, v34, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v65
-; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v36, v38, vcc_lo
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v39, 0x40c00000, v21
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v64
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v64
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[64:65]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 24, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v39, v50, vcc_lo
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v51, 0x40c00000, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v37
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v22, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v22
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v24
-; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v39, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v22, v48, v37, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v39, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v35, v21, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v51, 16, 1
; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_perm_b32 v71, v21, v20, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v21
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v22, v35 :: v_dual_add_f32 v22, 0x40c00000, v38
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v22, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v37
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v51, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v49, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v53, v21, v20, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v52, v48, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v51
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v22, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v35
-; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v22, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v48, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v48
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v39, v52 :: v_dual_add_f32 v50, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v22
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v22, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v70, v36, v35, 0x7060302
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v23
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v38, 16, 1
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v37, v39 :: v_dual_lshlrev_b32 v39, 16, v26
-; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v22, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v52, v39, v48, 0x7060302
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v49, v51, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v24, 0x7fff
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v48, 16, 1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v24, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v24, v49, v38, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v48
-; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v48, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v23, v36, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v70
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v23
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v24, v36, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v39
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v54, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v55, v50, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v39, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_add3_u32 v51, v49, v54, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v52
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v24, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v55
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v51, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-FAKE16-NEXT: v_perm_b32 v81, v23, v22, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v24, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v49, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v24, 0x7fff
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v24, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v80, v37, v36, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v26, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v25
+; GFX11-FAKE16-NEXT: v_perm_b32 v69, v39, v49, 0x7060302
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51
+; GFX11-FAKE16-NEXT: v_perm_b32 v70, v23, v22, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v50, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v55, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v55
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v51, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v69
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v54, v50, v55, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v26, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v26
-; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v39, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v48, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v26, 0x7fff
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v49, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v26, v50, v39, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v49
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v37, v25, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v39
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v49, 0x7fff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v36
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v80
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v26, v37 :: v_dual_add_f32 v26, 0x40c00000, v48
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v26, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v26, v64, v51, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v39, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v27
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v26, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v64
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v54, v65 :: v_dual_lshlrev_b32 v54, 16, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v26, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v50, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v26
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v83, v39, v50, 0x7060302
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v28, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v26, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v82, v38, v37, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v28, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v27
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v28
-; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v39, v49, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v28, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v28, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v54, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v51, v55, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v50, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v28, v51, v48, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v50
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v38, v27, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v48
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v50, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v64, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX11-FAKE16-NEXT: v_add3_u32 v28, v65, v54, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v39, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v54
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT: v_add3_u32 v55, v51, v64, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-FAKE16-NEXT: v_perm_b32 v83, v25, v24, 0x7060302
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v28, v38, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v49
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v49, 16, v29
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT: v_perm_b32 v84, v25, v24, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v27
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v51, v28, v39 :: v_dual_add_f32 v28, 0x40c00000, v65
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 24, v84
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v28, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v51, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v28
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v28, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v84, v39, v38, 0x7060302
+; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v28, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add3_u32 v54, v54, v28, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v96, v39, v51, 0x7060302
; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v30, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v29
; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v30
-; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v49, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v48, v50, vcc_lo
; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v30, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v55, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v54, v64, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v30, v52, v49, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v65
+; GFX11-FAKE16-NEXT: v_add3_u32 v30, v66, v55, 0x7fff
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v51
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v39, v29, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v49
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v51, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v29, v39, v29 :: v_dual_lshlrev_b32 v66, 16, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v55
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v54, v65, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v83
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v30, v39 :: v_dual_add_f32 v30, 0x40c00000, v50
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v97, v27, v26, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v30, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v66
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v51
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v64, v67, vcc_lo
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v31
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v30, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v48, v52, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v30
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add3_u32 v55, v55, v30, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v101, v39, v54, 0x7060302
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v64
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v30, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v86, v48, v39, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v32, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v31
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v32
-; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v49, v51, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v32, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v32, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v64, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v55, v65, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v52, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v32, v53, v50, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v48, v31, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v50
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v52, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v66, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v66
+; GFX11-FAKE16-NEXT: v_add3_u32 v32, v67, v64, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v39, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v64
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT: v_add3_u32 v65, v55, v66, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-FAKE16-NEXT: v_perm_b32 v85, v27, v26, 0x7060302
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v32, v48, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v51
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v32, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v32
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51
+; GFX11-FAKE16-NEXT: v_perm_b32 v102, v29, v28, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v67
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[101:102]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[96:97]
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v65, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_perm_b32 v112, v39, v55, 0x7060302
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v32, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v65
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v32, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v96, v49, v48, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v2, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v51, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[83:84]
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v65, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v64, v66, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v53, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v51
-; GFX11-FAKE16-NEXT: v_perm_b32 v87, v29, v28, 0x7060302
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v51, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v4
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-FAKE16-NEXT: v_add3_u32 v52, v32, v53, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v65
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v67, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v39, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v39, v71, v65, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v32, v67, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v67
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v54
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v115, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v39, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v68
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v49
-; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v98, v50, v32, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_add3_u32 v65, v65, v39, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v114, v64, v32, 0x7060302
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v66
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v52, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v66, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v65, v67, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v98
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v98
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v50, v55, v52, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v6
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v66
+; GFX11-FAKE16-NEXT: v_perm_b32 v113, v31, v30, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 24, v115
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v64, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v71, v66, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_add3_u32 v65, v65, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v54
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v68
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v67, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v67, 16, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v65, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v66, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v67
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-FAKE16-NEXT: v_perm_b32 v100, v3, v50, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v116, v3, v64, 0x7060302
; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v52, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6
-; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v53, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v65, v65, v66, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v67, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v5, 16, 1
; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v68, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v66, v53, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53
-; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v80, v67, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v67
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-FAKE16-NEXT: v_perm_b32 v97, v31, v30, 0x7060302
-; GFX11-FAKE16-NEXT: v_perm_b32 v103, v3, v51, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v3
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32
+; GFX11-FAKE16-NEXT: v_perm_b32 v119, v3, v65, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v67, v6, v68 :: v_dual_add_f32 v6, 0x40c00000, v71
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v7
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v6, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v66, v80, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v68
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-FAKE16-NEXT: v_perm_b32 v102, v5, v53, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v118, v5, v67, 0x7060302
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v6, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v54, 16, 1
-; GFX11-FAKE16-NEXT: v_perm_b32 v99, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v68, 16, 1
+; GFX11-FAKE16-NEXT: v_perm_b32 v117, v4, v39, 0x7060302
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v66, v71, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v7, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v8, v67, v54, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v7
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v10
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v7, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v81, v68, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v68
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v80 :: v_dual_lshlrev_b32 v80, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v7, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v8, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v80
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v71, 16, v9
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v8, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v66, v81, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v71, 0x40c00000, v71
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; GFX11-FAKE16-NEXT: v_perm_b32 v182, v7, v54, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v44, v7, v68, 0x7060302
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v8, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v10
-; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v55, 16, 1
-; GFX11-FAKE16-NEXT: v_perm_b32 v183, v5, v6, 0x7060302
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v71, 16, 1
+; GFX11-FAKE16-NEXT: v_perm_b32 v45, v5, v6, 0x7060302
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v66, v80, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v9, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v10, v112, v55, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v9
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v82, v71, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v71
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v81, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v9, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-FAKE16-NEXT: v_perm_b32 v101, v4, v49, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v64
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v10, v80 :: v_dual_add_f32 v10, 0x40c00000, v81
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v49
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v10, 16, 1
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11
-; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v66, v82 :: v_dual_lshlrev_b32 v66, 16, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v12, 16, 1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_perm_b32 v176, v9, v55, 0x7060302
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v52
-; GFX11-FAKE16-NEXT: v_add3_u32 v52, v66, v10, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10
-; GFX11-FAKE16-NEXT: v_add3_u32 v67, v67, v12, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v12
-; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_perm_b32 v182, v9, v71, 0x7060302
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v66
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v80, v10, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add3_u32 v81, v81, v12, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v9, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v66, v80
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v9
-; GFX11-FAKE16-NEXT: v_add3_u32 v52, v113, v9, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v177, v7, v8, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14
-; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v85, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v183, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v81, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v11, 16, 1
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67
-; GFX11-FAKE16-NEXT: v_add3_u32 v66, v112, v11, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v66, v80 :: v_dual_add_f32 v66, 0x40c00000, v81
+; GFX11-FAKE16-NEXT: v_add3_u32 v80, v82, v11, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v13
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v52, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v66, 16, 1
; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112
-; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v52, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v52
-; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v14, 16, 1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v66, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[112:113]
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v80, v81 :: v_dual_add_f32 v80, 0x40c00000, v82
+; GFX11-FAKE16-NEXT: v_add3_u32 v81, v85, v66, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v66
+; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v80, 16, 1
; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v66
-; GFX11-FAKE16-NEXT: v_perm_b32 v162, v11, v9, 0x7060302
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v14, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v14
-; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v66, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v87, 0x400000, v80
+; GFX11-FAKE16-NEXT: v_perm_b32 v176, v11, v9, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v81, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v81, v85, v14, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_add3_u32 v85, v86, v80, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v16
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v13, 16, 1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-FAKE16-NEXT: v_add3_u32 v112, v116, v13, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v81, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v81, 0x40c00000, v86
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-FAKE16-NEXT: v_add3_u32 v82, v98, v13, 0x7fff
; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_perm_b32 v149, v14, v52, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v67, 16, 1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v15
+; GFX11-FAKE16-NEXT: v_perm_b32 v163, v14, v66, 0x7060302
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v81, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v85, v87, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-FAKE16-NEXT: v_perm_b32 v163, v12, v10, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115
-; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v67, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v67
-; GFX11-FAKE16-NEXT: v_bfe_u32 v115, v16, 16, 1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT: v_perm_b32 v177, v12, v10, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v82, v85 :: v_dual_add_f32 v82, 0x40c00000, v87
+; GFX11-FAKE16-NEXT: v_add3_u32 v85, v86, v81, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v81
+; GFX11-FAKE16-NEXT: v_bfe_u32 v87, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v112, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v112
-; GFX11-FAKE16-NEXT: v_perm_b32 v148, v13, v66, 0x7060302
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v114, v115, v16, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v82, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v99, 0x400000, v82
+; GFX11-FAKE16-NEXT: v_perm_b32 v162, v13, v80, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v85, v86, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v86, v87, v16, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v87, 0x400000, v16
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v15, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v112, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112
-; GFX11-FAKE16-NEXT: v_add3_u32 v113, v113, v15, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87]
-; GFX11-FAKE16-NEXT: v_perm_b32 v135, v16, v67, 0x7060302
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v52
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v5
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v98, v98, v82, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v80
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v86, v87, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_add3_u32 v85, v85, v15, 0x7fff
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GFX11-FAKE16-NEXT: v_perm_b32 v134, v15, v112, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v112
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v31
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v51
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v53
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v25
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[182:183]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v67
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v66
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v54
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v38
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v55
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[70:71]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v135
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v135
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v134
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v134
+; GFX11-FAKE16-NEXT: v_perm_b32 v149, v16, v81, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v98, v99, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v81
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v65
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[176:177]
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v85, v100, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[44:45]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[98:99], 24, v[116:117]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v66
+; GFX11-FAKE16-NEXT: v_perm_b32 v148, v15, v82, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v82
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v71
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v68
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[148:149]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[162:163]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[182:183]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[118:119]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[114:115]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[69:70]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[52:53]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[37:38]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 24, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34]
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v149
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v149
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v148
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v148
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v163
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v163
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v162
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v162
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v162
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v177
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v177
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 24, v177
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v177
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v176
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v176
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v183
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v183
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 8, v176
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 24, v183
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v183
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v182
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v182
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v103
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v103
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v102
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v102
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v101
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v101
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v100
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v100
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v99
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v99
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v97
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v45
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v45
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v44
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v44
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 24, v119
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v119
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v118
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v118
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 24, v117
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v117
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v116
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v116
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v78, 8, v115
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v88, 16, v114
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v89, 8, v114
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 24, v113
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v113
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v112
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v112
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v102
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v102
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v101
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v101
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 24, v97
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v97
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v96
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v96
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v96
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v87
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v87
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v86
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v86
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v85
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v84
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v83
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v83
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v82
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v82
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v81
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v81
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v80
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v71
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v71
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v70
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v77
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 8, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 24, v53
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 8, v53
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v52
; GFX11-FAKE16-NEXT: .LBB90_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v76
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v37, 8, v89
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v99
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v63
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v75
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v76
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v88
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v73
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v54
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v37, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v37, 8, v78
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v166
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v180
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v62
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v98
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v160
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v66
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v44
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v66, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v37, 8, v72
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v51
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v59
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v58
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v86
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v65, v53
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v42
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v147
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v51
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v56
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v161
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v47
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v65
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v51
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v45
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v43
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v85
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v180
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v132
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v51
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v42
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v146
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v41
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v183
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v55, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v81
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v167
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v177
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v165
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v179
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v119
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v164
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v51
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v55, v66
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v133
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v178
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v161
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v176
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v167
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v80
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v118
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v149
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v80, 8, v164
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v131
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v163
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v51
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v55
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v66, v68
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v80
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v81, v82
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v51
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v148
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v145
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v162
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v151
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v67
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v144
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v135
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v150
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v149
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v134
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v148
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8
; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v131
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v145
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v64
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v130
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v144
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v103
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v135
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v74
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v72
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v79
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v77
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v33, 8, v33
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v34, v33
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
@@ -169981,35 +170068,35 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v74
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v132
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v38
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v63
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v61
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v36
; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v46
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v60
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v146
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v43
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v160
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v57
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v40
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v52
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v46
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v133
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v182
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v53
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v147
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v44
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v33, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
@@ -170026,35 +170113,35 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v178
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v54
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v176
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v134
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v182
; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v82
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v83
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v177
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v50
; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v151
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v130
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v165
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v103
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v119
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v49
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v33, v34
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
@@ -170071,31 +170158,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v115
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v102
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v97
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v118
; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v86
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v101
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v48
; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v87
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27
; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v71
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v116
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v112
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v115
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v35
; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v98
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v113
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v39
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v114
; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26
; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30
@@ -170123,29 +170210,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x15
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:96
+; GFX11-FAKE16-NEXT: s_clause 0x19
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:112
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
@@ -170206,464 +170297,484 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v61, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8
; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v10
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v62, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v28
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s24
; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s28
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v32
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v32
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v34
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36
; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v39
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48
; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_mul_f32_e32 v28, 1.0, v50
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v55
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v54
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v40
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v41
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v42
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43
-; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v49, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v42, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s27
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB91_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mov_b32_e32 v43, v36
-; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16
+; SI-NEXT: v_alignbit_b32 v36, v1, v3, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v1, v6, 16
+; SI-NEXT: v_alignbit_b32 v33, v1, v6, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_alignbit_b32 v2, v1, v13, 16
+; SI-NEXT: v_mov_b32_e32 v41, v28
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: v_alignbit_b32 v28, v1, v13, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16
-; SI-NEXT: v_alignbit_b32 v5, v1, v17, 16
+; SI-NEXT: v_mov_b32_e32 v40, v25
+; SI-NEXT: v_alignbit_b32 v25, v1, v19, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v1, v3, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16
+; SI-NEXT: v_alignbit_b32 v53, v1, v44, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT: v_alignbit_b32 v16, v1, v57, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v13, v1, v58, 16
+; SI-NEXT: v_alignbit_b32 v19, v1, v7, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57
+; SI-NEXT: v_alignbit_b32 v57, v1, v58, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT: v_alignbit_b32 v10, v1, v60, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
-; SI-NEXT: v_alignbit_b32 v44, v19, v8, 16
-; SI-NEXT: v_alignbit_b32 v7, v1, v22, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24
-; SI-NEXT: v_alignbit_b32 v60, v1, v27, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v13, v1, v56, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; SI-NEXT: v_alignbit_b32 v10, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; SI-NEXT: v_alignbit_b32 v7, v1, v23, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39
-; SI-NEXT: v_alignbit_b32 v57, v1, v30, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v15, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42
+; SI-NEXT: v_alignbit_b32 v44, v2, v8, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8
-; SI-NEXT: v_alignbit_b32 v58, v22, v9, 16
-; SI-NEXT: v_alignbit_b32 v40, v1, v37, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v44, v36, 24
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v44, v36, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
+; SI-NEXT: v_alignbit_b32 v58, v2, v9, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8
-; SI-NEXT: v_alignbit_b32 v47, v25, v12, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v58, v33, 24
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v58, v33, 16
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v58, v33, 8
+; SI-NEXT: v_alignbit_b32 v47, v23, v12, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v2, v47, v28, 24
+; SI-NEXT: v_alignbit_b32 v6, v1, v27, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v2, v47, v28, 16
+; SI-NEXT: v_alignbit_b32 v5, v1, v32, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8
+; SI-NEXT: v_alignbit_b32 v2, v47, v28, 8
+; SI-NEXT: v_alignbit_b32 v4, v1, v37, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v46
-; SI-NEXT: v_alignbit_b32 v53, v1, v48, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59
+; SI-NEXT: v_alignbit_b32 v3, v1, v48, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50
-; SI-NEXT: v_alignbit_b32 v50, v8, v59, 16
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v50, v2, v46, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v50, v25, 24
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v50, v25, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v52, v1, v52, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
-; SI-NEXT: v_mov_b32_e32 v17, v63
-; SI-NEXT: v_alignbit_b32 v1, v1, v41, 16
+; SI-NEXT: v_alignbit_b32 v2, v50, v25, 8
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v56, v1, v51, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52
+; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v45
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15
+; SI-NEXT: v_alignbit_b32 v35, v22, v17, 16
+; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v17, v40
+; SI-NEXT: v_mov_b32_e32 v40, v29
+; SI-NEXT: v_mov_b32_e32 v32, v49
; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31
-; SI-NEXT: v_alignbit_b32 v62, v8, v61, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v52, v44, v36, 8
+; SI-NEXT: v_alignbit_b32 v46, v35, v13, 8
+; SI-NEXT: v_mov_b32_e32 v48, v34
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11
+; SI-NEXT: v_alignbit_b32 v62, v2, v61, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v62, v53, 24
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v62, v53, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56
-; SI-NEXT: v_alignbit_b32 v55, v8, v63, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v62, v53, 8
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60
+; SI-NEXT: v_alignbit_b32 v55, v2, v63, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v55, v19, 24
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v48, v62, v4, 8
-; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; SI-NEXT: v_alignbit_b32 v38, v8, v45, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v55, v19, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v55, v19, 8
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; SI-NEXT: v_alignbit_b32 v35, v8, v18, 16
-; SI-NEXT: v_mov_b32_e32 v45, v8
-; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v29, v35, v13, 8
-; SI-NEXT: v_alignbit_b32 v61, v38, v16, 24
-; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59
+; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v62
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
; SI-NEXT: v_alignbit_b32 v30, v8, v21, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23
-; SI-NEXT: v_alignbit_b32 v27, v8, v24, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24
+; SI-NEXT: v_alignbit_b32 v27, v8, v26, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v44
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_alignbit_b32 v38, v51, v2, 16
+; SI-NEXT: v_alignbit_b32 v2, v38, v57, 24
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20
-; SI-NEXT: v_alignbit_b32 v24, v8, v26, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v38, v57, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v38, v57, 8
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; SI-NEXT: v_alignbit_b32 v37, v8, v18, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32
-; SI-NEXT: v_alignbit_b32 v21, v8, v14, 16
+; SI-NEXT: v_alignbit_b32 v8, v37, v6, 24
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v37, v6, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v8, v37, v6, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17
+; SI-NEXT: v_alignbit_b32 v21, v8, v29, 16
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v8, v21, v5, 24
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v21, v5, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v34
-; SI-NEXT: v_alignbit_b32 v18, v8, v15, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v21, v5, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v18, v9
+; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v29
+; SI-NEXT: v_alignbit_b32 v61, v8, v34, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v18, v40, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v61, v4, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v18, v40, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v8, v18, v40, 8
+; SI-NEXT: v_alignbit_b32 v8, v61, v4, 16
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v28
-; SI-NEXT: v_alignbit_b32 v63, v8, v51, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v61, v4, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v53, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v41
+; SI-NEXT: v_alignbit_b32 v63, v8, v49, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v53, 16
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v33
+; SI-NEXT: v_alignbit_b32 v8, v63, v3, 24
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v53, 8
-; SI-NEXT: v_alignbit_b32 v12, v40, v43, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v63, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v31
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v63, v3, 8
+; SI-NEXT: v_alignbit_b32 v12, v49, v54, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v12, v56, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8
+; SI-NEXT: v_alignbit_b32 v8, v12, v56, 16
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42
-; SI-NEXT: v_mov_b32_e32 v15, v9
-; SI-NEXT: v_alignbit_b32 v9, v8, v54, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v12, v56, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_alignbit_b32 v9, v8, v43, 16
+; SI-NEXT: v_mov_b32_e32 v43, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v24
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v2
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v37
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v8, v37
-; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v49
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v43
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v46
-; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v56
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v34
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v21
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v28
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v29
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v15
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v33
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v61
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v59
-; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v20
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v42
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v41
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v24
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v63
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v42
-; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v39
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v60
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v31
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v20, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v45
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v9
-; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16
-; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v12
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58
-; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v15
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55
-; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v37, v34
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v29, v28
-; SI-NEXT: v_mov_b32_e32 v23, v48
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v63
-; SI-NEXT: v_mov_b32_e32 v48, v33
-; SI-NEXT: v_mov_b32_e32 v34, v53
-; SI-NEXT: v_mov_b32_e32 v53, v42
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v9
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v24, v51
+; SI-NEXT: v_mov_b32_e32 v51, v2
+; SI-NEXT: v_mov_b32_e32 v2, v22
+; SI-NEXT: v_mov_b32_e32 v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v58
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v47
+; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v50
+; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v55
+; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v38
+; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v35
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v30
+; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v27
+; SI-NEXT: v_mov_b32_e32 v15, v43
+; SI-NEXT: v_mov_b32_e32 v43, v39
; SI-NEXT: s_branch .LBB91_3
; SI-NEXT: .LBB91_2:
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v40, v29
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -170806,139 +170917,116 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr9
; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: v_mov_b32_e32 v53, v42
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_mov_b32_e32 v48, v33
-; SI-NEXT: v_mov_b32_e32 v29, v28
-; SI-NEXT: v_mov_b32_e32 v37, v34
-; SI-NEXT: v_mov_b32_e32 v17, v63
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v43, v39
+; SI-NEXT: v_mov_b32_e32 v32, v49
+; SI-NEXT: v_mov_b32_e32 v48, v34
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v15, v25
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr9
; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; kill: killed $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr9
; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; kill: killed $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; kill: killed $vgpr56
+; SI-NEXT: ; kill: killed $vgpr17
+; SI-NEXT: ; kill: killed $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; kill: killed $vgpr17
; SI-NEXT: .LBB91_3: ; %Flow
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v56, v17
-; SI-NEXT: v_mov_b32_e32 v54, v61
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v42, v32
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_cbranch_vccnz .LBB91_5
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v52, v3, v2, 16
+; SI-NEXT: v_alignbit_b32 v56, v3, v2, 16
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
@@ -170949,941 +171037,960 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33
+; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16
+; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v43
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_alignbit_b32 v57, v7, v5, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26
+; SI-NEXT: v_alignbit_b32 v5, v7, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
; SI-NEXT: v_alignbit_b32 v9, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11
-; SI-NEXT: v_alignbit_b32 v12, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11
+; SI-NEXT: v_alignbit_b32 v12, v59, v7, 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v60, v10, v6, 16
+; SI-NEXT: v_alignbit_b32 v6, v10, v6, 16
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_alignbit_b32 v7, v13, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v14
; SI-NEXT: v_alignbit_b32 v63, v13, v10, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v48
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v14, v10, 16
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v32
-; SI-NEXT: v_alignbit_b32 v18, v14, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v21, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_alignbit_b32 v10, v16, v10, 16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v17
+; SI-NEXT: v_alignbit_b32 v61, v16, v13, 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v41
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v16, v13, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v17
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_alignbit_b32 v13, v19, v13, 16
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; SI-NEXT: v_alignbit_b32 v21, v19, v16, 16
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v51
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v19, v16, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v24, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v37, v2, v19, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v24
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v27, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v3, v22, v19, 16
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v22
+; SI-NEXT: v_alignbit_b32 v57, v22, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v37
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v54
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v44
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v21
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v61
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v63
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v12
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v9
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v46
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v19
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v19
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v27, v2, v22, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_alignbit_b32 v4, v25, v22, 16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_alignbit_b32 v19, v25, v19, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v25
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v40
-; SI-NEXT: v_alignbit_b32 v35, v45, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v15
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v30, v2, v25, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_alignbit_b32 v5, v28, v25, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v28
+; SI-NEXT: v_alignbit_b32 v53, v28, v22, 16
+; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v25
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v25
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51
+; SI-NEXT: v_alignbit_b32 v35, v2, v28, 16
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_alignbit_b32 v25, v32, v25, 16
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v28
+; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v35
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v52
+; SI-NEXT: v_alignbit_b32 v38, v24, v32, 16
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v38
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41
-; SI-NEXT: v_alignbit_b32 v38, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; SI-NEXT: v_alignbit_b32 v2, v33, v28, 16
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v28, v33, v28, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v33
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v54
+; SI-NEXT: v_alignbit_b32 v55, v18, v32, 16
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v43
-; SI-NEXT: v_alignbit_b32 v55, v61, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; SI-NEXT: v_alignbit_b32 v6, v36, v33, 16
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_alignbit_b32 v33, v36, v33, 16
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v36
-; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v46
+; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v36
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41
+; SI-NEXT: v_alignbit_b32 v62, v15, v32, 16
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; SI-NEXT: v_alignbit_b32 v62, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v39
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v42
-; SI-NEXT: v_alignbit_b32 v50, v17, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v39
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v56
-; SI-NEXT: v_alignbit_b32 v47, v25, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39
-; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50
-; SI-NEXT: v_alignbit_b32 v58, v22, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v56
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42
-; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v63
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46
-; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v41
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43
-; SI-NEXT: v_alignbit_b32 v43, v38, v16, 8
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v54
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v41
-; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v52
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40
-; SI-NEXT: v_mov_b32_e32 v40, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v54
-; SI-NEXT: v_alignbit_b32 v54, v38, v16, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v51
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v20, v35, v13, 8
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v49
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v39
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23
-; SI-NEXT: v_alignbit_b32 v23, v62, v4, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v31
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v49
-; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v20
+; SI-NEXT: v_mov_b32_e32 v49, v59
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v62
+; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v55
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v27
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v23
+; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; SI-NEXT: v_alignbit_b32 v36, v48, v36, 16
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48
+; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v48
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v40
+; SI-NEXT: v_alignbit_b32 v50, v15, v32, 16
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v46, v35, v13, 8
+; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v50
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48
+; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v48
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v42
+; SI-NEXT: v_alignbit_b32 v47, v22, v32, 16
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48
+; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v48
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43
+; SI-NEXT: v_alignbit_b32 v58, v23, v32, 16
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v26
-; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v44, v19, v14, 16
-; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40
+; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v58
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_alignbit_b32 v44, v16, v32, 16
; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v8, v58, v33, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8
+; SI-NEXT: v_alignbit_b32 v8, v58, v33, 16
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v58, v33, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v47, v28, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8
+; SI-NEXT: v_alignbit_b32 v8, v47, v28, 16
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v47, v28, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16
+; SI-NEXT: v_alignbit_b32 v8, v50, v25, 24
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8
+; SI-NEXT: v_alignbit_b32 v8, v50, v25, 16
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v50, v25, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16
+; SI-NEXT: v_alignbit_b32 v8, v62, v53, 24
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8
+; SI-NEXT: v_alignbit_b32 v8, v62, v53, 16
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v62, v53, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v55, v19, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v55, v19, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v55, v19, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8
+; SI-NEXT: v_alignbit_b32 v8, v38, v57, 24
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v8, v38, v57, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v8, v38, v57, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v37, v6, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v37, v6, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v37, v6, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16
+; SI-NEXT: v_alignbit_b32 v8, v21, v5, 24
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v18, v51, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v21, v5, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v18, v51, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v8, v18, v51, 8
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v21, v5, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v12
-; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v61, v4, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21
+; SI-NEXT: v_alignbit_b32 v8, v61, v4, 16
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v34, 24
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v9
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v61, v4, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v34, 16
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v63, v3, 24
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v34, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v63, v3, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v63, v3, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v8, v12, v56, 24
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8
+; SI-NEXT: v_alignbit_b32 v8, v12, v56, 16
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v8, v12, v56, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v52, v44, v36, 8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v44
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27
+; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v47
; SI-NEXT: .LBB91_5: ; %end
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v36, 0xff, v36
-; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v52
+; SI-NEXT: v_or_b32_e32 v36, v36, v52
+; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v26
+; SI-NEXT: v_and_b32_e32 v33, 0xff, v33
+; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v34
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v28
-; SI-NEXT: v_or_b32_e32 v32, v36, v32
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v36, 0xff, v29
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; SI-NEXT: v_and_b32_e32 v52, 0xff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v29
-; SI-NEXT: v_or_b32_e32 v36, v56, v36
-; SI-NEXT: v_or_b32_e32 v32, v32, v36
-; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xff, v44
-; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v51
-; SI-NEXT: v_or_b32_e32 v32, v32, v36
-; SI-NEXT: v_and_b32_e32 v36, 0xff, v19
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT: v_or_b32_e32 v14, v14, v36
-; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; SI-NEXT: v_or_b32_e32 v14, v32, v14
-; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v17
+; SI-NEXT: v_or_b32_e32 v51, v51, v52
+; SI-NEXT: v_or_b32_e32 v36, v36, v51
+; SI-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v19
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v14, v14, v32
-; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT: v_and_b32_e32 v36, 0xff, v44
+; SI-NEXT: v_or_b32_e32 v36, v36, v39
+; SI-NEXT: v_and_b32_e32 v39, 0xff, v16
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xff, v19
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v15
+; SI-NEXT: v_or_b32_e32 v39, v51, v39
+; SI-NEXT: v_or_b32_e32 v36, v36, v39
+; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT: v_or_b32_e32 v17, v33, v17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v19
-; SI-NEXT: v_or_b32_e32 v32, v33, v32
-; SI-NEXT: v_or_b32_e32 v14, v14, v32
-; SI-NEXT: v_add_i32_e32 v32, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v58
-; SI-NEXT: v_or_b32_e32 v11, v14, v11
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v33, 0xff, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v26
+; SI-NEXT: v_or_b32_e32 v33, v36, v33
+; SI-NEXT: v_or_b32_e32 v17, v17, v33
+; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v17, v33, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v58
+; SI-NEXT: v_or_b32_e32 v17, v17, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xff, v23
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v6
-; SI-NEXT: v_or_b32_e32 v14, v32, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15
+; SI-NEXT: v_or_b32_e32 v31, v32, v31
+; SI-NEXT: v_or_b32_e32 v17, v17, v31
+; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v2
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v60
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v19
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v28, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v17, v17, v28
+; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v28, 0xff, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v23
+; SI-NEXT: v_or_b32_e32 v28, v31, v28
+; SI-NEXT: v_or_b32_e32 v17, v17, v28
+; SI-NEXT: v_add_i32_e32 v28, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v17, v28, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v47
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v49
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v25
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v47
+; SI-NEXT: v_or_b32_e32 v8, v17, v8
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v22
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2
-; SI-NEXT: v_or_b32_e32 v14, v28, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v15
+; SI-NEXT: v_or_b32_e32 v17, v28, v17
+; SI-NEXT: v_or_b32_e32 v8, v8, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v57
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT: v_or_b32_e32 v8, v8, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v19
+; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v25, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v25, v17
+; SI-NEXT: v_or_b32_e32 v8, v8, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v50
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v39
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v17
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v50
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v59
+; SI-NEXT: v_or_b32_e32 v8, v8, v17
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v2
-; SI-NEXT: v_or_b32_e32 v14, v25, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v23
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17
-; SI-NEXT: v_or_b32_e32 v14, v22, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v15
+; SI-NEXT: v_or_b32_e32 v17, v25, v17
+; SI-NEXT: v_or_b32_e32 v8, v8, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v62
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v31
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v53
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT: v_or_b32_e32 v8, v8, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17
-; SI-NEXT: v_or_b32_e32 v14, v22, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v22, v17
+; SI-NEXT: v_or_b32_e32 v8, v8, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v8, v17, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v34
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v62
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v19, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v55
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v61
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17
-; SI-NEXT: v_or_b32_e32 v14, v19, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v16
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v43
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v41
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v54
-; SI-NEXT: v_or_b32_e32 v14, v16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT: v_or_b32_e32 v11, v17, v11
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v38
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v19
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v2
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16
+; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v16, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v13
-; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v20
-; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v17, v11
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v55
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v20
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
-; SI-NEXT: v_or_b32_e32 v13, v14, v13
-; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v35
-; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v15
-; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v45
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
-; SI-NEXT: v_or_b32_e32 v13, v14, v13
-; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT: v_or_b32_e32 v11, v17, v11
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v57
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_or_b32_e32 v11, v13, v11
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0
-; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v11, v16, v11
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v30
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v38
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v60
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v24
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16
+; SI-NEXT: v_or_b32_e32 v11, v16, v11
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v13
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v46
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13
; SI-NEXT: v_or_b32_e32 v11, v13, v11
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; SI-NEXT: v_or_b32_e32 v7, v7, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v35
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v45
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v2
+; SI-NEXT: v_or_b32_e32 v11, v13, v11
+; SI-NEXT: v_or_b32_e32 v8, v8, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0
+; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v10
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v8, v8, v10
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v2
+; SI-NEXT: v_or_b32_e32 v10, v11, v10
+; SI-NEXT: v_or_b32_e32 v8, v8, v10
+; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0
+; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v30
+; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v14
+; SI-NEXT: v_or_b32_e32 v8, v8, v10
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v2
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_or_b32_e32 v7, v7, v10
-; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0
-; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v8, v8, v10
+; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v2
+; SI-NEXT: v_or_b32_e32 v8, v10, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0
+; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0xff, v27
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v18
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v2
; SI-NEXT: v_or_b32_e32 v8, v10, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v6, v6, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v26
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2
; SI-NEXT: v_or_b32_e32 v7, v8, v7
; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v24
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v6, v6, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2
; SI-NEXT: v_or_b32_e32 v7, v8, v7
; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v2
; SI-NEXT: v_or_b32_e32 v6, v7, v6
; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xff, v21
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2
; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v2
; SI-NEXT: v_or_b32_e32 v6, v7, v6
; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v52
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v2
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v18
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v61
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v2
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v2
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xff, v63
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2
; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v2
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v56
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -171894,14 +172001,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v12
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v40
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v49
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
@@ -171911,12 +172018,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -171927,10 +172034,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v9
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
@@ -171968,8 +172075,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v63, s30, 0
; VI-NEXT: v_writelane_b32 v63, s31, 1
@@ -172774,26 +172881,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -172804,126 +172911,129 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v4
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v3
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2
; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24]
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12
; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v5
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v17
+; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15
; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15
; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14
-; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14
; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
@@ -172936,23 +173046,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: s_branch .LBB91_5
; VI-NEXT: .LBB91_3:
; VI-NEXT: ; implicit-def: $sgpr46
@@ -173112,23 +173220,29 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: s_branch .LBB91_2
; VI-NEXT: .LBB91_4:
; VI-NEXT: v_mov_b32_e32 v33, s71
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s69
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s70
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s68
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s67
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s86
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s83
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, s66
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, s64
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, s65
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, s54
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, s53
; VI-NEXT: v_mov_b32_e32 v31, s4
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s82
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, s86
; VI-NEXT: v_readlane_b32 s4, v62, 0
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s4
; VI-NEXT: v_readlane_b32 s4, v62, 1
; VI-NEXT: v_mov_b32_e32 v40, s4
@@ -173160,171 +173274,170 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readlane_b32 s4, v62, 13
; VI-NEXT: v_mov_b32_e32 v46, s4
; VI-NEXT: v_readlane_b32 s4, v62, 14
-; VI-NEXT: v_mov_b32_e32 v50, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 15
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s4
+; VI-NEXT: v_readlane_b32 s4, v62, 15
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, s4
; VI-NEXT: v_readlane_b32 s4, v62, 16
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s4
; VI-NEXT: v_readlane_b32 s4, v62, 17
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s4
; VI-NEXT: v_readlane_b32 s4, v62, 18
+; VI-NEXT: v_mov_b32_e32 v36, s4
+; VI-NEXT: v_readlane_b32 s4, v62, 19
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 19
-; VI-NEXT: v_mov_b32_e32 v55, s4
; VI-NEXT: v_readlane_b32 s4, v62, 20
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: v_mov_b32_e32 v52, s4
; VI-NEXT: v_readlane_b32 s4, v62, 21
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: v_mov_b32_e32 v49, s4
; VI-NEXT: v_readlane_b32 s4, v62, 22
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v49, s4
; VI-NEXT: v_readlane_b32 s4, v62, 23
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 24
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 25
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 26
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 27
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 28
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 29
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 30
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 31
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 32
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 33
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 34
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 35
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 36
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 37
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 38
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 39
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 40
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 41
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 42
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 43
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 44
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 45
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 46
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 47
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 48
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 49
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 50
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 51
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 52
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 53
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 54
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 55
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 56
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 57
-; VI-NEXT: v_mov_b32_e32 v42, s54
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s4
+; VI-NEXT: v_mov_b32_e32 v42, s51
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v41, s46
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v41, s56
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s58
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v41, s58
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v41, s60
; VI-NEXT: v_mov_b32_e32 v45, s72
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v45, s74
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v45, s76
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v45, s78
; VI-NEXT: v_mov_b32_e32 v55, s88
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v36, s66
-; VI-NEXT: v_mov_b32_e32 v52, s64
-; VI-NEXT: v_mov_b32_e32 v55, v50
-; VI-NEXT: v_mov_b32_e32 v35, s30
-; VI-NEXT: v_mov_b32_e32 v59, s87
-; VI-NEXT: v_mov_b32_e32 v58, s34
-; VI-NEXT: v_mov_b32_e32 v45, s36
+; VI-NEXT: v_mov_b32_e32 v61, s50
+; VI-NEXT: v_mov_b32_e32 v58, s83
+; VI-NEXT: v_mov_b32_e32 v55, v48
+; VI-NEXT: v_mov_b32_e32 v48, v47
+; VI-NEXT: v_mov_b32_e32 v57, s30
+; VI-NEXT: v_mov_b32_e32 v35, s83
+; VI-NEXT: v_mov_b32_e32 v60, s34
+; VI-NEXT: v_mov_b32_e32 v42, s36
; VI-NEXT: v_mov_b32_e32 v34, s38
; VI-NEXT: v_mov_b32_e32 v1, s44
; VI-NEXT: v_mov_b32_e32 v2, s45
@@ -173357,44 +173470,44 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_mov_b32_e32 v29, s28
; VI-NEXT: v_mov_b32_e32 v30, s29
; VI-NEXT: v_mov_b32_e32 v32, s5
+; VI-NEXT: v_mov_b32_e32 v38, s87
+; VI-NEXT: v_mov_b32_e32 v37, s82
; VI-NEXT: v_mov_b32_e32 v41, s62
-; VI-NEXT: v_mov_b32_e32 v57, s81
-; VI-NEXT: v_mov_b32_e32 v37, s84
-; VI-NEXT: v_mov_b32_e32 v60, s52
-; VI-NEXT: v_mov_b32_e32 v38, s51
-; VI-NEXT: v_mov_b32_e32 v61, s65
-; VI-NEXT: v_mov_b32_e32 v49, s66
-; VI-NEXT: v_mov_b32_e32 v39, s55
+; VI-NEXT: v_mov_b32_e32 v59, s84
+; VI-NEXT: v_mov_b32_e32 v39, s51
; VI-NEXT: v_mov_b32_e32 v50, v46
-; VI-NEXT: v_mov_b32_e32 v46, v48
-; VI-NEXT: v_mov_b32_e32 v48, v47
; VI-NEXT: v_mov_b32_e32 v47, v56
; VI-NEXT: v_mov_b32_e32 v56, v51
; VI-NEXT: v_mov_b32_e32 v51, s90
; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v35, s85
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v34, s48
; VI-NEXT: v_mov_b32_e32 v51, v53
; VI-NEXT: v_mov_b32_e32 v53, v54
; VI-NEXT: v_mov_b32_e32 v54, v40
; VI-NEXT: v_mov_b32_e32 v40, s80
-; VI-NEXT: v_mov_b32_e32 v58, s50
-; VI-NEXT: v_mov_b32_e32 v45, s53
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v57, s81
+; VI-NEXT: v_mov_b32_e32 v58, s85
+; VI-NEXT: v_mov_b32_e32 v60, s50
+; VI-NEXT: v_mov_b32_e32 v61, s52
+; VI-NEXT: v_mov_b32_e32 v42, s55
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; VI-NEXT: .LBB91_5: ; %end
-; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v36
; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v52
+; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_readlane_b32 s87, v63, 31
; VI-NEXT: v_readlane_b32 s86, v63, 30
; VI-NEXT: v_readlane_b32 s85, v63, 29
@@ -173427,44 +173540,35 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readlane_b32 s34, v63, 2
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v45
; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v49
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -173475,23 +173579,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -173502,20 +173606,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -173528,21 +173632,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -173553,23 +173657,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -173580,23 +173684,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -173607,23 +173711,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -173634,90 +173738,95 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0
; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0
; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v38
; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37
; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57
; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -173771,22 +173880,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -173812,8 +173923,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -173876,8 +173987,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_readfirstlane_b32 s59, v14
; GFX9-NEXT: v_readfirstlane_b32 s56, v15
; GFX9-NEXT: v_readfirstlane_b32 s57, v16
-; GFX9-NEXT: v_readfirstlane_b32 s46, v17
-; GFX9-NEXT: v_readfirstlane_b32 s47, v18
+; GFX9-NEXT: v_readfirstlane_b32 s44, v17
+; GFX9-NEXT: v_readfirstlane_b32 s45, v18
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
@@ -173992,51 +174103,51 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_writelane_b32 v62, s41, 3
; GFX9-NEXT: s_lshr_b64 s[40:41], s[22:23], 24
; GFX9-NEXT: v_writelane_b32 v62, s40, 0
-; GFX9-NEXT: s_lshr_b32 s70, s47, 24
-; GFX9-NEXT: s_lshr_b32 s15, s47, 16
-; GFX9-NEXT: s_lshr_b32 s7, s47, 8
-; GFX9-NEXT: s_lshr_b32 s53, s46, 16
-; GFX9-NEXT: s_lshr_b32 s52, s46, 8
-; GFX9-NEXT: s_lshr_b32 s67, s57, 24
+; GFX9-NEXT: s_lshr_b32 s53, s45, 24
+; GFX9-NEXT: s_lshr_b32 s15, s45, 16
+; GFX9-NEXT: s_lshr_b32 s70, s45, 8
+; GFX9-NEXT: s_lshr_b32 s7, s44, 16
+; GFX9-NEXT: s_lshr_b32 s6, s44, 8
+; GFX9-NEXT: s_lshr_b32 s65, s57, 24
; GFX9-NEXT: s_lshr_b32 s14, s57, 16
-; GFX9-NEXT: s_lshr_b32 s69, s57, 8
-; GFX9-NEXT: s_lshr_b32 s6, s56, 16
-; GFX9-NEXT: s_lshr_b32 s71, s56, 8
-; GFX9-NEXT: s_lshr_b32 s64, s59, 24
+; GFX9-NEXT: s_lshr_b32 s68, s57, 8
+; GFX9-NEXT: s_lshr_b32 s69, s56, 16
+; GFX9-NEXT: s_lshr_b32 s38, s56, 8
+; GFX9-NEXT: s_lshr_b32 s54, s59, 24
; GFX9-NEXT: s_lshr_b32 s13, s59, 16
-; GFX9-NEXT: s_lshr_b32 s66, s59, 8
-; GFX9-NEXT: s_lshr_b32 s51, s58, 16
-; GFX9-NEXT: s_lshr_b32 s68, s58, 8
-; GFX9-NEXT: s_lshr_b32 s99, s61, 24
+; GFX9-NEXT: s_lshr_b32 s52, s59, 8
+; GFX9-NEXT: s_lshr_b32 s67, s58, 16
+; GFX9-NEXT: s_lshr_b32 s66, s58, 8
+; GFX9-NEXT: s_lshr_b32 s97, s61, 24
; GFX9-NEXT: s_lshr_b32 s12, s61, 16
-; GFX9-NEXT: s_lshr_b32 s55, s61, 8
-; GFX9-NEXT: s_lshr_b32 s50, s60, 16
-; GFX9-NEXT: s_lshr_b32 s65, s60, 8
-; GFX9-NEXT: s_lshr_b32 s96, s63, 24
+; GFX9-NEXT: s_lshr_b32 s51, s61, 8
+; GFX9-NEXT: s_lshr_b32 s64, s60, 16
+; GFX9-NEXT: s_lshr_b32 s55, s60, 8
+; GFX9-NEXT: s_lshr_b32 s86, s63, 24
; GFX9-NEXT: s_lshr_b32 s11, s63, 16
-; GFX9-NEXT: s_lshr_b32 s98, s63, 8
-; GFX9-NEXT: s_lshr_b32 s49, s62, 16
-; GFX9-NEXT: s_lshr_b32 s54, s62, 8
-; GFX9-NEXT: s_lshr_b32 s85, s73, 24
+; GFX9-NEXT: s_lshr_b32 s50, s63, 8
+; GFX9-NEXT: s_lshr_b32 s99, s62, 16
+; GFX9-NEXT: s_lshr_b32 s98, s62, 8
+; GFX9-NEXT: s_lshr_b32 s83, s73, 24
; GFX9-NEXT: s_lshr_b32 s10, s73, 16
-; GFX9-NEXT: s_lshr_b32 s87, s73, 8
-; GFX9-NEXT: s_lshr_b32 s48, s72, 16
-; GFX9-NEXT: s_lshr_b32 s97, s72, 8
-; GFX9-NEXT: s_lshr_b32 s82, s75, 24
+; GFX9-NEXT: s_lshr_b32 s49, s73, 8
+; GFX9-NEXT: s_lshr_b32 s96, s72, 16
+; GFX9-NEXT: s_lshr_b32 s87, s72, 8
+; GFX9-NEXT: s_lshr_b32 s80, s75, 24
; GFX9-NEXT: s_lshr_b32 s9, s75, 16
-; GFX9-NEXT: s_lshr_b32 s84, s75, 8
-; GFX9-NEXT: s_lshr_b32 s39, s74, 16
-; GFX9-NEXT: s_lshr_b32 s86, s74, 8
-; GFX9-NEXT: s_lshr_b32 s80, s77, 24
+; GFX9-NEXT: s_lshr_b32 s48, s75, 8
+; GFX9-NEXT: s_lshr_b32 s85, s74, 16
+; GFX9-NEXT: s_lshr_b32 s84, s74, 8
+; GFX9-NEXT: s_lshr_b32 s71, s77, 24
; GFX9-NEXT: s_lshr_b32 s8, s77, 16
-; GFX9-NEXT: s_lshr_b32 s81, s77, 8
-; GFX9-NEXT: s_lshr_b32 s38, s76, 16
-; GFX9-NEXT: s_lshr_b32 s83, s76, 8
+; GFX9-NEXT: s_lshr_b32 s39, s77, 8
+; GFX9-NEXT: s_lshr_b32 s82, s76, 16
+; GFX9-NEXT: s_lshr_b32 s81, s76, 8
; GFX9-NEXT: v_writelane_b32 v62, s41, 1
; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
-; GFX9-NEXT: s_lshr_b64 s[78:79], s[46:47], 24
+; GFX9-NEXT: s_lshr_b64 s[46:47], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24
; GFX9-NEXT: s_lshr_b64 s[88:89], s[56:57], 24
; GFX9-NEXT: s_lshr_b64 s[90:91], s[58:59], 24
; GFX9-NEXT: s_lshr_b64 s[92:93], s[60:61], 24
@@ -174047,698 +174158,697 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_cbranch_execnz .LBB91_4
; GFX9-NEXT: .LBB91_2: ; %cmp.true
; GFX9-NEXT: s_and_b32 s6, s77, 0xffff0000
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-NEXT: v_add_f32_e32 v1, s6, v5
+; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
+; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: s_lshl_b32 s6, s77, 16
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v1, s6, v5
+; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
+; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: s_and_b32 s6, s76, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v8, v5, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v1, s6, v5
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v3, v3, v1
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: s_lshl_b32 s6, s76, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s6, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_add_f32_e32 v3, s6, v5
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-NEXT: s_and_b32 s6, s75, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v7, v2, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v3, s6, v5
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
; GFX9-NEXT: s_lshl_b32 s6, s75, 16
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v31
-; GFX9-NEXT: s_and_b32 s6, s74, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v14, v5, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_lshl_b32 s6, s74, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s6, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v3, s6, v5
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v33
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v32
+; GFX9-NEXT: s_and_b32 s6, s74, 0xffff0000
+; GFX9-NEXT: v_lshl_or_b32 v4, v7, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v3, s6, v5
+; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v6, v6, v3
+; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6
+; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: s_lshl_b32 s6, s74, 16
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v34
; GFX9-NEXT: s_and_b32 s6, s73, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v13, v2, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: s_lshl_b32 s6, s73, 16
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v35
; GFX9-NEXT: s_and_b32 s6, s72, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v16, v32, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_lshl_b32 s6, s72, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s6, v1
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v36
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_add_f32_e32 v7, s6, v5
+; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v8, v8, v7
+; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v37
; GFX9-NEXT: s_and_b32 s6, s63, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v15, v2, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_lshl_or_b32 v9, v6, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: s_lshl_b32 s6, s63, 16
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v37
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v38
; GFX9-NEXT: s_and_b32 s6, s62, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_lshl_or_b32 v16, v36, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_lshl_b32 s6, s62, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s6, v1
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v39
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_add_f32_e32 v7, s6, v5
+; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v8, v8, v7
+; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8
+; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v48
; GFX9-NEXT: s_and_b32 s6, s61, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v17, v2, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_lshl_or_b32 v15, v6, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: s_lshl_b32 s6, s61, 16
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v48
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v49
; GFX9-NEXT: s_and_b32 s6, s60, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v20, v38, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_lshl_or_b32 v21, v39, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_lshl_b32 s6, s60, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s6, v1
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v50
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_add_f32_e32 v7, s6, v5
+; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v8, v8, v7
+; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8
+; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v51
; GFX9-NEXT: s_and_b32 s6, s59, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v19, v2, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_lshl_or_b32 v20, v6, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: s_lshl_b32 s6, s59, 16
-; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v51
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v52
; GFX9-NEXT: s_and_b32 s6, s58, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_lshl_b32 s6, s58, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s6, v1
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v53
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_add_f32_e32 v7, s6, v5
+; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v8, v8, v7
+; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8
+; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v54
; GFX9-NEXT: s_and_b32 s6, s57, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v21, v2, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_lshl_or_b32 v22, v6, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: s_lshl_b32 s6, s57, 16
-; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v54
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v55
; GFX9-NEXT: s_and_b32 s6, s56, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_lshl_or_b32 v25, v53, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_lshl_b32 s6, s56, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s6, v1
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v40
-; GFX9-NEXT: s_and_b32 s6, s47, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v23, v2, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: s_lshl_b32 s6, s47, 16
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v41
-; GFX9-NEXT: s_and_b32 s6, s46, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v2
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_lshl_b32 s6, s46, 16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add_f32_e32 v3, s6, v1
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v42
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_add_f32_e32 v7, s6, v5
+; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v8, v8, v7
+; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8
+; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v41
+; GFX9-NEXT: s_and_b32 s6, s45, 0xffff0000
+; GFX9-NEXT: v_lshl_or_b32 v24, v6, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: s_lshl_b32 s6, s45, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v42
+; GFX9-NEXT: s_and_b32 s6, s44, 0xffff0000
+; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: s_lshl_b32 s6, s44, 16
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT: v_add_f32_e32 v7, s6, v5
+; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX9-NEXT: v_add_u32_e32 v8, v8, v7
+; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8
+; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v43
; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000
-; GFX9-NEXT: v_lshl_or_b32 v25, v2, 16, v3
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_lshl_or_b32 v26, v6, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s11, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s17, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s17, s6, 16
; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s8, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s16, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s10, s9
; GFX9-NEXT: s_lshr_b32 s16, s6, 16
; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s44, s16, s8
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s12, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s19, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s19, s6, 16
; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s8, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s18, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s10, s9
; GFX9-NEXT: s_lshr_b32 s18, s6, 16
; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s13, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s21, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s21, s6, 16
; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s8, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s20, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s10, s9
; GFX9-NEXT: s_lshr_b32 s20, s6, 16
; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s14, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s23, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s23, s6, 16
; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s8, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s22, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s10, s9
; GFX9-NEXT: s_lshr_b32 s22, s6, 16
; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s15, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s25, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s25, s6, 16
; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s8, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s24, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s10, s9
; GFX9-NEXT: s_lshr_b32 s24, s6, 16
; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s76, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s27, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s27, s6, 16
; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s8, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s26, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s10, s9
; GFX9-NEXT: s_lshr_b32 s26, s6, 16
; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s77, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s29, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s29, s6, 16
; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshr_b32 s8, s6, 16
; GFX9-NEXT: s_lshl_b32 s6, s28, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s10, s9
; GFX9-NEXT: s_lshr_b32 s28, s6, 16
; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
; GFX9-NEXT: s_add_i32 s7, s7, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8
; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s9, s8
; GFX9-NEXT: s_lshl_b32 s5, s5, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s5, v1
-; GFX9-NEXT: v_readfirstlane_b32 s5, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s5, v5
+; GFX9-NEXT: v_readfirstlane_b32 s5, v6
; GFX9-NEXT: s_lshr_b32 s78, s6, 16
; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010
; GFX9-NEXT: s_add_i32 s6, s6, s5
; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff
; GFX9-NEXT: s_bitset1_b32 s5, 22
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT: s_cselect_b32 s5, s5, s8
; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: v_add_f32_e32 v6, s6, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
; GFX9-NEXT: s_bfe_u32 s8, s6, 0x10010
; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: s_lshr_b32 s5, s5, 16
; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff
; GFX9-NEXT: s_bitset1_b32 s6, 22
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_lshl_b32 s4, s4, 16
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_add_f32_e32 v5, s4, v5
+; GFX9-NEXT: v_readfirstlane_b32 s4, v5
; GFX9-NEXT: s_bfe_u32 s8, s4, 0x10010
; GFX9-NEXT: s_add_i32 s8, s8, s4
; GFX9-NEXT: s_lshr_b32 s6, s6, 16
; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff
; GFX9-NEXT: s_bitset1_b32 s4, 22
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26]
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], 24, v[23:24]
; GFX9-NEXT: s_cselect_b32 s4, s4, s10
-; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[21:22]
-; GFX9-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16]
-; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s11
+; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[26:27]
+; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[20:21]
+; GFX9-NEXT: s_pack_ll_b32_b16 s45, s17, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s12
; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s13
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20]
-; GFX9-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14]
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], 24, v[24:25]
+; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[15:16]
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[3:4]
; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s14
; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s15
; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s76
@@ -174747,9 +174857,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s6
; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24
; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24
-; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[17:18]
-; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8]
+; GFX9-NEXT: s_lshr_b64 s[46:47], s[44:45], 24
+; GFX9-NEXT: v_lshrrev_b64 v[7:8], 24, v[22:23]
+; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[1:2]
; GFX9-NEXT: s_lshr_b64 s[34:35], s[6:7], 24
; GFX9-NEXT: s_lshr_b64 s[36:37], s[74:75], 24
; GFX9-NEXT: s_lshr_b64 s[38:39], s[72:73], 24
@@ -174759,7 +174870,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_lshr_b32 s10, s7, 8
; GFX9-NEXT: s_lshr_b32 s41, s6, 16
; GFX9-NEXT: s_lshr_b32 s43, s6, 8
-; GFX9-NEXT: s_lshr_b32 s45, s75, 24
+; GFX9-NEXT: s_lshr_b32 s47, s75, 24
; GFX9-NEXT: s_lshr_b32 s75, s75, 8
; GFX9-NEXT: s_lshr_b32 s79, s74, 16
; GFX9-NEXT: s_lshr_b32 s74, s74, 8
@@ -174783,42 +174894,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_lshr_b32 s57, s57, 8
; GFX9-NEXT: s_lshr_b32 vcc_hi, s56, 16
; GFX9-NEXT: s_lshr_b32 s56, s56, 8
-; GFX9-NEXT: s_lshr_b32 s30, s47, 24
-; GFX9-NEXT: s_lshr_b32 s47, s47, 8
-; GFX9-NEXT: s_lshr_b32 s8, s46, 16
-; GFX9-NEXT: s_lshr_b32 s7, s46, 8
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v26
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v26
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX9-NEXT: s_lshr_b32 s30, s45, 24
+; GFX9-NEXT: s_lshr_b32 s45, s45, 8
+; GFX9-NEXT: s_lshr_b32 s8, s44, 16
+; GFX9-NEXT: s_lshr_b32 s7, s44, 8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v25
; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v25
-; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v24
; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v24
-; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v23
; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v22
; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v21
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v14
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v28, 24, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX9-NEXT: s_branch .LBB91_5
; GFX9-NEXT: .LBB91_3:
; GFX9-NEXT: ; implicit-def: $sgpr6
@@ -174833,46 +174944,46 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: ; implicit-def: $sgpr6
; GFX9-NEXT: ; kill: killed $sgpr6
; GFX9-NEXT: ; implicit-def: $sgpr78
-; GFX9-NEXT: ; implicit-def: $sgpr83
-; GFX9-NEXT: ; implicit-def: $sgpr38
; GFX9-NEXT: ; implicit-def: $sgpr81
-; GFX9-NEXT: ; implicit-def: $sgpr8
-; GFX9-NEXT: ; implicit-def: $sgpr80
-; GFX9-NEXT: ; implicit-def: $sgpr86
+; GFX9-NEXT: ; implicit-def: $sgpr82
; GFX9-NEXT: ; implicit-def: $sgpr39
+; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr71
; GFX9-NEXT: ; implicit-def: $sgpr84
-; GFX9-NEXT: ; implicit-def: $sgpr9
-; GFX9-NEXT: ; implicit-def: $sgpr82
-; GFX9-NEXT: ; implicit-def: $sgpr97
+; GFX9-NEXT: ; implicit-def: $sgpr85
; GFX9-NEXT: ; implicit-def: $sgpr48
+; GFX9-NEXT: ; implicit-def: $sgpr9
+; GFX9-NEXT: ; implicit-def: $sgpr80
; GFX9-NEXT: ; implicit-def: $sgpr87
-; GFX9-NEXT: ; implicit-def: $sgpr10
-; GFX9-NEXT: ; implicit-def: $sgpr85
-; GFX9-NEXT: ; implicit-def: $sgpr54
+; GFX9-NEXT: ; implicit-def: $sgpr96
; GFX9-NEXT: ; implicit-def: $sgpr49
+; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr83
; GFX9-NEXT: ; implicit-def: $sgpr98
-; GFX9-NEXT: ; implicit-def: $sgpr11
-; GFX9-NEXT: ; implicit-def: $sgpr96
-; GFX9-NEXT: ; implicit-def: $sgpr65
+; GFX9-NEXT: ; implicit-def: $sgpr99
; GFX9-NEXT: ; implicit-def: $sgpr50
+; GFX9-NEXT: ; implicit-def: $sgpr11
+; GFX9-NEXT: ; implicit-def: $sgpr86
; GFX9-NEXT: ; implicit-def: $sgpr55
-; GFX9-NEXT: ; implicit-def: $sgpr12
-; GFX9-NEXT: ; implicit-def: $sgpr99
-; GFX9-NEXT: ; implicit-def: $sgpr68
+; GFX9-NEXT: ; implicit-def: $sgpr64
; GFX9-NEXT: ; implicit-def: $sgpr51
+; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr97
; GFX9-NEXT: ; implicit-def: $sgpr66
+; GFX9-NEXT: ; implicit-def: $sgpr67
+; GFX9-NEXT: ; implicit-def: $sgpr52
; GFX9-NEXT: ; implicit-def: $sgpr13
-; GFX9-NEXT: ; implicit-def: $sgpr64
-; GFX9-NEXT: ; implicit-def: $sgpr71
+; GFX9-NEXT: ; implicit-def: $sgpr54
+; GFX9-NEXT: ; implicit-def: $sgpr38
; GFX9-NEXT: ; implicit-def: $sgpr69
+; GFX9-NEXT: ; implicit-def: $sgpr68
; GFX9-NEXT: ; implicit-def: $sgpr14
-; GFX9-NEXT: ; implicit-def: $sgpr67
-; GFX9-NEXT: ; implicit-def: $sgpr52
-; GFX9-NEXT: ; implicit-def: $sgpr53
+; GFX9-NEXT: ; implicit-def: $sgpr65
; GFX9-NEXT: ; implicit-def: $sgpr7
-; GFX9-NEXT: ; implicit-def: $sgpr15
; GFX9-NEXT: ; implicit-def: $sgpr70
-; GFX9-NEXT: ; implicit-def: $sgpr44
+; GFX9-NEXT: ; implicit-def: $sgpr15
+; GFX9-NEXT: ; implicit-def: $sgpr53
+; GFX9-NEXT: ; implicit-def: $sgpr46
; GFX9-NEXT: ; implicit-def: $sgpr42
; GFX9-NEXT: ; implicit-def: $sgpr40
; GFX9-NEXT: ; implicit-def: $sgpr36
@@ -174976,72 +175087,72 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v1, s77
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-NEXT: v_mov_b32_e32 v46, s51
-; GFX9-NEXT: v_mov_b32_e32 v56, s50
-; GFX9-NEXT: v_mov_b32_e32 v58, s49
-; GFX9-NEXT: v_mov_b32_e32 v60, s48
-; GFX9-NEXT: v_mov_b32_e32 v27, s39
-; GFX9-NEXT: v_mov_b32_e32 v29, s38
-; GFX9-NEXT: v_mov_b32_e32 v10, s34
-; GFX9-NEXT: v_mov_b32_e32 v11, s36
+; GFX9-NEXT: v_mov_b32_e32 v24, s38
+; GFX9-NEXT: v_mov_b32_e32 v21, s51
+; GFX9-NEXT: v_mov_b32_e32 v16, s50
+; GFX9-NEXT: v_mov_b32_e32 v10, s49
+; GFX9-NEXT: v_mov_b32_e32 v4, s48
+; GFX9-NEXT: v_mov_b32_e32 v2, s39
+; GFX9-NEXT: v_mov_b32_e32 v17, s34
+; GFX9-NEXT: v_mov_b32_e32 v18, s36
; GFX9-NEXT: v_readlane_b32 s34, v62, 8
; GFX9-NEXT: v_readlane_b32 s36, v62, 6
; GFX9-NEXT: v_readlane_b32 s38, v62, 4
; GFX9-NEXT: v_readlane_b32 s48, v62, 2
; GFX9-NEXT: v_readlane_b32 s50, v62, 0
-; GFX9-NEXT: v_mov_b32_e32 v42, s46
-; GFX9-NEXT: v_mov_b32_e32 v41, s47
-; GFX9-NEXT: v_mov_b32_e32 v55, s15
-; GFX9-NEXT: v_mov_b32_e32 v40, s56
-; GFX9-NEXT: v_mov_b32_e32 v54, s57
-; GFX9-NEXT: v_mov_b32_e32 v52, s14
-; GFX9-NEXT: v_mov_b32_e32 v53, s58
-; GFX9-NEXT: v_mov_b32_e32 v51, s59
-; GFX9-NEXT: v_mov_b32_e32 v49, s13
-; GFX9-NEXT: v_mov_b32_e32 v50, s60
-; GFX9-NEXT: v_mov_b32_e32 v48, s61
-; GFX9-NEXT: v_mov_b32_e32 v38, s12
-; GFX9-NEXT: v_mov_b32_e32 v39, s62
-; GFX9-NEXT: v_mov_b32_e32 v37, s63
-; GFX9-NEXT: v_mov_b32_e32 v35, s11
-; GFX9-NEXT: v_mov_b32_e32 v36, s72
-; GFX9-NEXT: v_mov_b32_e32 v34, s73
-; GFX9-NEXT: v_mov_b32_e32 v32, s10
-; GFX9-NEXT: v_mov_b32_e32 v33, s74
-; GFX9-NEXT: v_mov_b32_e32 v31, s75
+; GFX9-NEXT: v_mov_b32_e32 v43, s44
+; GFX9-NEXT: v_mov_b32_e32 v42, s45
+; GFX9-NEXT: v_mov_b32_e32 v40, s15
+; GFX9-NEXT: v_mov_b32_e32 v41, s56
+; GFX9-NEXT: v_mov_b32_e32 v55, s57
+; GFX9-NEXT: v_mov_b32_e32 v53, s14
+; GFX9-NEXT: v_mov_b32_e32 v54, s58
+; GFX9-NEXT: v_mov_b32_e32 v52, s59
+; GFX9-NEXT: v_mov_b32_e32 v50, s13
+; GFX9-NEXT: v_mov_b32_e32 v51, s60
+; GFX9-NEXT: v_mov_b32_e32 v49, s61
+; GFX9-NEXT: v_mov_b32_e32 v39, s12
+; GFX9-NEXT: v_mov_b32_e32 v48, s62
+; GFX9-NEXT: v_mov_b32_e32 v38, s63
+; GFX9-NEXT: v_mov_b32_e32 v36, s11
+; GFX9-NEXT: v_mov_b32_e32 v37, s72
+; GFX9-NEXT: v_mov_b32_e32 v35, s73
+; GFX9-NEXT: v_mov_b32_e32 v33, s10
+; GFX9-NEXT: v_mov_b32_e32 v34, s74
+; GFX9-NEXT: v_mov_b32_e32 v32, s75
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v26, s53
-; GFX9-NEXT: v_mov_b32_e32 v25, s52
-; GFX9-NEXT: v_mov_b32_e32 v6, s70
-; GFX9-NEXT: v_mov_b32_e32 v12, s7
-; GFX9-NEXT: v_mov_b32_e32 v44, s6
-; GFX9-NEXT: v_mov_b32_e32 v23, s71
-; GFX9-NEXT: v_mov_b32_e32 v43, s67
-; GFX9-NEXT: v_mov_b32_e32 v24, s69
-; GFX9-NEXT: v_mov_b32_e32 v21, s68
-; GFX9-NEXT: v_mov_b32_e32 v45, s64
+; GFX9-NEXT: v_mov_b32_e32 v19, s7
+; GFX9-NEXT: v_mov_b32_e32 v26, s6
+; GFX9-NEXT: v_mov_b32_e32 v8, s53
+; GFX9-NEXT: v_mov_b32_e32 v14, s70
+; GFX9-NEXT: v_mov_b32_e32 v44, s69
+; GFX9-NEXT: v_mov_b32_e32 v27, s65
+; GFX9-NEXT: v_mov_b32_e32 v25, s68
+; GFX9-NEXT: v_mov_b32_e32 v46, s67
; GFX9-NEXT: v_mov_b32_e32 v22, s66
-; GFX9-NEXT: v_mov_b32_e32 v19, s65
-; GFX9-NEXT: v_mov_b32_e32 v47, s99
+; GFX9-NEXT: v_mov_b32_e32 v45, s54
+; GFX9-NEXT: v_mov_b32_e32 v23, s52
+; GFX9-NEXT: v_mov_b32_e32 v56, s64
; GFX9-NEXT: v_mov_b32_e32 v20, s55
-; GFX9-NEXT: v_mov_b32_e32 v17, s54
-; GFX9-NEXT: v_mov_b32_e32 v57, s96
-; GFX9-NEXT: v_mov_b32_e32 v18, s98
-; GFX9-NEXT: v_mov_b32_e32 v15, s97
-; GFX9-NEXT: v_mov_b32_e32 v59, s85
-; GFX9-NEXT: v_mov_b32_e32 v16, s87
-; GFX9-NEXT: v_mov_b32_e32 v13, s86
-; GFX9-NEXT: v_mov_b32_e32 v61, s82
-; GFX9-NEXT: v_mov_b32_e32 v14, s84
-; GFX9-NEXT: v_mov_b32_e32 v7, s83
-; GFX9-NEXT: v_mov_b32_e32 v28, s80
-; GFX9-NEXT: v_mov_b32_e32 v8, s81
-; GFX9-NEXT: v_mov_b32_e32 v1, s78
-; GFX9-NEXT: v_mov_b32_e32 v2, s88
-; GFX9-NEXT: v_mov_b32_e32 v3, s90
-; GFX9-NEXT: v_mov_b32_e32 v4, s92
-; GFX9-NEXT: v_mov_b32_e32 v5, s94
-; GFX9-NEXT: v_mov_b32_e32 v9, s30
+; GFX9-NEXT: v_mov_b32_e32 v47, s97
+; GFX9-NEXT: v_mov_b32_e32 v58, s99
+; GFX9-NEXT: v_mov_b32_e32 v15, s98
+; GFX9-NEXT: v_mov_b32_e32 v57, s86
+; GFX9-NEXT: v_mov_b32_e32 v60, s96
+; GFX9-NEXT: v_mov_b32_e32 v9, s87
+; GFX9-NEXT: v_mov_b32_e32 v59, s83
+; GFX9-NEXT: v_mov_b32_e32 v28, s85
+; GFX9-NEXT: v_mov_b32_e32 v3, s84
+; GFX9-NEXT: v_mov_b32_e32 v61, s80
+; GFX9-NEXT: v_mov_b32_e32 v30, s82
+; GFX9-NEXT: v_mov_b32_e32 v1, s81
+; GFX9-NEXT: v_mov_b32_e32 v29, s71
+; GFX9-NEXT: v_mov_b32_e32 v5, s78
+; GFX9-NEXT: v_mov_b32_e32 v6, s88
+; GFX9-NEXT: v_mov_b32_e32 v7, s90
+; GFX9-NEXT: v_mov_b32_e32 v11, s92
+; GFX9-NEXT: v_mov_b32_e32 v12, s94
+; GFX9-NEXT: v_mov_b32_e32 v13, s30
; GFX9-NEXT: v_readlane_b32 s11, v62, 10
; GFX9-NEXT: v_readlane_b32 s12, v62, 11
; GFX9-NEXT: v_readlane_b32 s13, v62, 12
@@ -175054,7 +175165,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_readlane_b32 s10, v62, 19
; GFX9-NEXT: v_readlane_b32 s41, v62, 20
; GFX9-NEXT: v_readlane_b32 s43, v62, 21
-; GFX9-NEXT: v_readlane_b32 s45, v62, 22
+; GFX9-NEXT: v_readlane_b32 s47, v62, 22
; GFX9-NEXT: v_readlane_b32 s75, v62, 23
; GFX9-NEXT: v_readlane_b32 s79, v62, 24
; GFX9-NEXT: v_readlane_b32 s74, v62, 25
@@ -175079,7 +175190,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_readlane_b32 vcc_hi, v62, 44
; GFX9-NEXT: v_readlane_b32 s56, v62, 45
; GFX9-NEXT: v_readlane_b32 s30, v62, 46
-; GFX9-NEXT: v_readlane_b32 s47, v62, 47
+; GFX9-NEXT: v_readlane_b32 s45, v62, 47
; GFX9-NEXT: v_readlane_b32 s8, v62, 48
; GFX9-NEXT: v_readlane_b32 s7, v62, 49
; GFX9-NEXT: v_readlane_b32 s35, v62, 9
@@ -175092,14 +175203,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_lshl_b32 s7, s7, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: s_and_b32 s7, s8, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s44, 8
+; GFX9-NEXT: s_lshl_b32 s8, s46, 8
; GFX9-NEXT: s_or_b32 s7, s7, s8
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s17, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s47, 8
+; GFX9-NEXT: s_lshl_b32 s7, s45, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: s_and_b32 s7, s11, 0xff
; GFX9-NEXT: s_lshl_b32 s8, s30, 8
@@ -175107,8 +175218,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s18, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s56, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175118,8 +175229,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s19, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s57, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175129,8 +175240,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s20, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s58, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175140,8 +175251,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s21, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s59, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175151,8 +175262,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s22, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s60, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175162,8 +175273,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s23, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s61, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175173,8 +175284,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s24, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s62, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175184,8 +175295,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s25, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s63, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175195,8 +175306,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s26, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s72, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175206,8 +175317,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s27, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s73, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175217,8 +175328,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s28, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s74, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
@@ -175228,19 +175339,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s6, s29, 0xff
; GFX9-NEXT: s_lshl_b32 s7, s75, 8
; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: s_and_b32 s7, s77, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s45, 8
+; GFX9-NEXT: s_lshl_b32 s8, s47, 8
; GFX9-NEXT: s_or_b32 s7, s7, s8
; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
; GFX9-NEXT: s_lshl_b32 s7, s7, 16
; GFX9-NEXT: s_or_b32 s6, s6, s7
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: v_mov_b32_e32 v30, s6
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: v_mov_b32_e32 v31, s6
; GFX9-NEXT: s_and_b32 s4, s4, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s43, 8
; GFX9-NEXT: s_or_b32 s4, s4, s6
@@ -175250,8 +175361,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s4, s4, s6
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT: v_mov_b32_e32 v30, s4
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: v_mov_b32_e32 v31, s4
; GFX9-NEXT: s_and_b32 s4, s5, 0xff
; GFX9-NEXT: s_lshl_b32 s5, s10, 8
; GFX9-NEXT: s_or_b32 s4, s4, s5
@@ -175261,23 +175372,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-NEXT: s_lshl_b32 s5, s5, 16
; GFX9-NEXT: s_or_b32 s4, s4, s5
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: v_mov_b32_e32 v30, s4
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX9-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX9-NEXT: v_or_b32_sdwa v5, v58, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX9-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: v_mov_b32_e32 v31, s4
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18
+; GFX9-NEXT: v_or_b32_sdwa v18, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readlane_b32 s99, v63, 35
; GFX9-NEXT: v_readlane_b32 s98, v63, 34
; GFX9-NEXT: v_readlane_b32 s97, v63, 33
@@ -175315,93 +175416,103 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: v_readlane_b32 s31, v63, 1
; GFX9-NEXT: v_readlane_b32 s30, v63, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v8
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v28
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v29
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v10
-; GFX9-NEXT: v_or_b32_sdwa v7, v33, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v8, v27, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v14
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v61
-; GFX9-NEXT: v_or_b32_sdwa v7, v31, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17
+; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v61
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v8, v10, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v9
-; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v8, v60, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v16
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v59
-; GFX9-NEXT: v_or_b32_sdwa v7, v34, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v8, v32, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17
-; GFX9-NEXT: v_or_b32_sdwa v7, v39, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v18
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v57
-; GFX9-NEXT: v_or_b32_sdwa v5, v37, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v19
-; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v47
-; GFX9-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v5, v38, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21
-; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v22
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v45
-; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23
-; GFX9-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v43
-; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v25
-; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v13
+; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v59
+; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v12
+; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v57
+; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v11
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v47
+; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v7
+; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v45
+; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v27
+; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v5
+; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8
+; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -175444,26 +175555,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s97, 1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v4
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v5
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v5
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2
; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s98, 2
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v6
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v7
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v8
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v6
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v8
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3
; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s99, 3
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v9
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v10
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v11
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4
; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s100, 4
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s43, v12
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v13
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v14
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5
; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s101, 5
; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0
-; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6
@@ -175497,333 +175608,332 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s87, 31
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB91_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[24:25], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s2, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s1, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 6
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s1, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s0, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s0, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s41, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s41, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s41, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s40, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s40, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s43, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s43, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s43, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s42, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s42, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s5, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s5, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s5, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s26, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s4, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s4, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s7, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s25, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s7, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s7, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s6, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s6, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s9, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s9, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s9, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s8, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s8, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s45, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s45, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 31
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s46, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s45, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s45, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s44, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s44, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s59, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s14, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s59, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 30
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s47, 9
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[22:23], 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s44, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 29
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s44, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s59, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s59, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 28
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s59, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s15, 9
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s58, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s58, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s61, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s61, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s61, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s60, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s60, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s63, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s63, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s63, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s58, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s58, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 27
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s63, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s63, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s63, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s62, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s73, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s73, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s73, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s73, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s72, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s72, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s29, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 23
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s73, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s72, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s72, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 22
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s29, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s29, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 21
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s29, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 21
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s28, 8
; GFX11-TRUE16-NEXT: s_lshr_b64 s[100:101], s[26:27], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[2:3], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[0:1], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[40:41], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 8
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[42:43], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[44:45], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 20
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[2:3], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 17
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s20, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[44:45], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[58:59], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 15
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 14
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 13
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 10
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 9
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s17, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s17, 16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 6
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 6
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s16, 16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 4
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s16, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 24
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s62, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 25
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 26
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[20:21], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s62, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s46, 25
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s47, 26
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[20:21], 24
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 18
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 19
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[18:19], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 12
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s46, 18
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s47, 19
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[18:19], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s46, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s47, 12
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB91_4
; GFX11-TRUE16-NEXT: .LBB91_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX11-TRUE16-NEXT: s_and_b32 s14, s58, 0xffff0000
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s58, 16
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s29, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s58, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s43, s58, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 16
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s41, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s11
+; GFX11-TRUE16-NEXT: s_and_b32 s13, s6, 0xffff0000
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s6, 16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s41, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s77, s28, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_and_b32 s47, s59, 0xffff0000
; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s58, 0x10010
-; GFX11-TRUE16-NEXT: s_lshl_b32 s78, s28, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s6, s58
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s73, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s76, s73, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s74, s72, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s75, s72, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s59, 16
+; GFX11-TRUE16-NEXT: s_add_i32 s59, s6, s58
+; GFX11-TRUE16-NEXT: s_and_b32 s75, s28, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s76, s28, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s74, s73, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s73, s73, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s61, s72, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s72, s72, 16
; GFX11-TRUE16-NEXT: s_and_b32 s12, s63, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s73, s63, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s63, s62, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s72, s62, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s62, s61, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s61, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s47, s60, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s60, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s46, s59, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s28, s59, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s45, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s45, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s45, s44, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s44, s44, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s29, s43, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s43, s43, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s13, s42, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s42, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s40, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s40, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s60, s63, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s56, s62, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s62, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s41, s45, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s28, s45, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s14, s44, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s44, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s9, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s44, s8, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s8, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s29, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s40, s7, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s5, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s4, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s59, 0x7fff
; GFX11-TRUE16-NEXT: s_bitset1_b32 s58, 22
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s58, s41
+; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s58, s59
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s40, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s4, 16
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s77
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s75
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s78
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s76
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s1, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s58, s1, 0x10010
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s1
+; GFX11-TRUE16-NEXT: s_add_i32 s58, s58, s1
; GFX11-TRUE16-NEXT: s_bitset1_b32 s1, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s41
-; GFX11-TRUE16-NEXT: s_and_b32 s40, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_addk_i32 s58, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s58
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s0, 0xffff0000
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v4, v6
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
; GFX11-TRUE16-NEXT: v_add_f32_e64 v51, 0x40c00000, s6
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s58, s4, 0x10010
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_add_i32 s58, s58, s4
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s58, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s59, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
-; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41
+; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s58
; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s73
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s74
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s0, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s58, s0, 0x10010
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v4
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s5, s0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s40, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT: s_add_i32 s58, s58, s0
; GFX11-TRUE16-NEXT: s_bitset1_b32 s0, 22
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s41
-; GFX11-TRUE16-NEXT: s_and_b32 s40, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_addk_i32 s58, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s59, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s58
+; GFX11-TRUE16-NEXT: s_and_b32 s58, s3, 0xffff0000
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s58
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v6
; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v9
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v1, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v3
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s59, s58, 0x10010
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41
+; GFX11-TRUE16-NEXT: s_add_i32 s59, s59, s58
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s58, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s59, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s62, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s58, s58, s59
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v10
; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s75
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s40, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s72
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s58, 16
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v10
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s74
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s3, 0x10010
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s61
+; GFX11-TRUE16-NEXT: s_bfe_u32 s59, s3, 0x10010
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s3
+; GFX11-TRUE16-NEXT: s_add_i32 s59, s59, s3
; GFX11-TRUE16-NEXT: s_bitset1_b32 s3, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT: s_addk_i32 s59, 0x7fff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v9
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s41
-; GFX11-TRUE16-NEXT: s_and_b32 s40, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s58, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s59
+; GFX11-TRUE16-NEXT: s_and_b32 s58, s2, 0xffff0000
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s58
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v3
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s59, s58, 0x10010
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41
+; GFX11-TRUE16-NEXT: s_add_i32 s59, s59, s58
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s58, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s59, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s61, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s58, s58, s59
; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s73
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s60
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v5
; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s12
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
@@ -175833,210 +175943,214 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s2, 0x10010
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s12, s2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s40, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT: s_add_i32 s59, s12, s2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s58, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s59, 0x7fff
; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 22
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s41
-; GFX11-TRUE16-NEXT: s_and_b32 s40, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s58, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s59
+; GFX11-TRUE16-NEXT: s_and_b32 s58, s17, 0xffff0000
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v27
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v8
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v24, 16, v7
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s58
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v9, 16, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s59, s58, 0x10010
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT: s_add_i32 s59, s59, s58
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s58, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s59, 0x7fff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41
+; GFX11-TRUE16-NEXT: s_and_b32 s60, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s58, s58, s59
; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s17
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s72
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s57
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v8
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s63
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s56
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s17, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s56, s17, 0x10010
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s17
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s40, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT: s_add_i32 s56, s56, s17
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s58, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s56, 0x7fff
; GFX11-TRUE16-NEXT: s_bitset1_b32 s17, 22
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s17, s17, s41
-; GFX11-TRUE16-NEXT: s_and_b32 s40, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s57, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s17, s17, s56
+; GFX11-TRUE16-NEXT: s_and_b32 s56, s16, 0xffff0000
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v26
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v6
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s56
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v10
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s56, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s47
; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s17, 16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_bfe_u32 s47, s56, 0x10010
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s62
-; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41
+; GFX11-TRUE16-NEXT: s_add_i32 s47, s47, s56
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s56, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s47, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_and_b32 s57, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s47, s56, s47
; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s16
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v15
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s16, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s46
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s57
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s16, 0x10010
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s16
+; GFX11-TRUE16-NEXT: s_bfe_u32 s46, s16, 0x10010
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v29
+; GFX11-TRUE16-NEXT: s_add_i32 s56, s46, s16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s47, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s56, 0x7fff
; GFX11-TRUE16-NEXT: s_bitset1_b32 s16, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s16, s16, s41
-; GFX11-TRUE16-NEXT: s_and_b32 s40, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s16, s16, s56
+; GFX11-TRUE16-NEXT: s_and_b32 s47, s19, 0xffff0000
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s47
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v6, 16, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s47, v12
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s16, 16
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v6, 16, v9
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s47
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v8
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41
+; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s16, 16
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s47, 0x10010
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s43
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s47
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s47, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s47, s42
; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 16
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s56
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v8
; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s19
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v11
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s42, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v11
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v12
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s40, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s19, 0x10010
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s19, 0x10010
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s19
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s19
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v10
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
; GFX11-TRUE16-NEXT: s_bitset1_b32 s19, 22
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s19, s19, s41
-; GFX11-TRUE16-NEXT: s_and_b32 s40, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s19, s19, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s18, 0xffff0000
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v10
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s42
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
; GFX11-TRUE16-NEXT: s_lshr_b32 s19, s19, 16
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s15
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s41
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s46
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s15
-; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s41, 0x10010
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s41
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s41, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s28
-; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s41, s41, s42
; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 16
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s18
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s40, 16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s18, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v28, 16, v8
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s18, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v5, 16, v10
-; GFX11-TRUE16-NEXT: s_bfe_u32 s28, s18, 0x10010
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, s18
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s18, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s28, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s18, s18, s28
-; GFX11-TRUE16-NEXT: s_and_b32 s28, s21, 0xffff0000
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s28
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_bfe_u32 s28, s18, 0x10010
; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s28, s18
+; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s41, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s18, 22
+; GFX11-TRUE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s18, s18, s42
+; GFX11-TRUE16-NEXT: s_and_b32 s41, s21, 0xffff0000
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s41
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v11, 16, 1
; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s18, 16
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v11
-; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14
; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s15
; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s21, 16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s15
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s14, 16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s14, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v17
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v13
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-TRUE16-NEXT: s_bfe_u32 s21, s15, 0x10010
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, s15
@@ -176049,20 +176163,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v10
; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s15
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v12
; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s14, 16
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s11
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s2, s12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s11, 0x10010
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s9
; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s11
; GFX11-TRUE16-NEXT: s_bitset1_b32 s11, 22
; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
@@ -176072,35 +176186,35 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v33
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v10
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s14, s9, 0x10010
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v6, 16, v11
-; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s14, 0x10010
+; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, s9
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s14, 0x7fff
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-TRUE16-NEXT: s_add_i32 s15, s9, s14
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s11, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s9, s14
; GFX11-TRUE16-NEXT: s_and_b32 s14, s23, 0xffff0000
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s11, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s9, 16
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s45
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s44
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s44
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s45
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -176109,14 +176223,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14
; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15
+; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s14, s15
; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s23, 16
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v10
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s11, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s9, 16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v11
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -176128,12 +176242,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14
; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s14, s15
; GFX11-TRUE16-NEXT: s_and_b32 s14, s22, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s11, 16
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s43
+; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s9, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s40
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s14
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
@@ -176150,15 +176264,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14
; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15
+; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s14, s15
; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s14
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v39
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v10
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s9, 16
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v9
@@ -176169,34 +176283,34 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s14
; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
; GFX11-TRUE16-NEXT: s_addk_i32 s13, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s13
-; GFX11-TRUE16-NEXT: s_and_b32 s14, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s14, s13
+; GFX11-TRUE16-NEXT: s_and_b32 s13, s25, 0xffff0000
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s13
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s10
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s13, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s9, 16
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v8, 16, 1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-TRUE16-NEXT: s_bfe_u32 s14, s10, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s13, s10, 0x10010
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, s10
+; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s10
; GFX11-TRUE16-NEXT: s_bitset1_b32 s10, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s14, 0x7fff
+; GFX11-TRUE16-NEXT: s_addk_i32 s13, 0x7fff
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s10, s10, s14
-; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s25, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s10, s13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 16
; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s8
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s10, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5
@@ -176204,19 +176318,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
-; GFX11-TRUE16-NEXT: s_bfe_u32 s13, s8, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s8, 0x10010
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v11, 16, 1
-; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s8
+; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, s8
; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s13, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s13
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xffff0000
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v48, v11
@@ -176226,39 +176340,39 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37
-; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s7, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s7, 0x10010
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v7
-; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, s7
+; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, s7
; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x7fff
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x7fff
; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s7, s10
+; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s7, s9
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s24, 16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s5
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s7, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v51, 16, 1
-; GFX11-TRUE16-NEXT: s_bfe_u32 s4, s8, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s8, 0x10010
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v6
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, s8
+; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s8
; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x7fff
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff
; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s8, s4
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s8, s5
; GFX11-TRUE16-NEXT: s_and_b32 s6, s27, 0xffff0000
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
; GFX11-TRUE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v51
-; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s4, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s5, 16
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v52
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
@@ -176269,8 +176383,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s6
; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s6, s7
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v8, v11
@@ -176279,7 +176393,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s4, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s5, 16
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v66, v5, 16, v13
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
@@ -176294,52 +176408,56 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s6, s7
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xffff0000
-; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s4, 16
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v51
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v55, v48, 16, v8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s29
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s63
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v54, v6, 16, v10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s3, s59
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s17, s60
-; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s6, 0x10010
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s16, s42
-; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s6, s5
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s13, 16
-; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s62
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s63
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s27, s73
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s3, s62
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s21, s72
+; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s20, s11
+; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, s8
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s8, s9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s26, 16
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s18, s28
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s40, 16
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s16, s46
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s26, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[54:55]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[66:67]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[70:71]
-; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s12, 0x10010
+; GFX11-TRUE16-NEXT: s_bfe_u32 s28, s26, 0x10010
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[18:19]
-; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, s12
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s12, 22
-; GFX11-TRUE16-NEXT: s_addk_i32 s11, 0x7fff
-; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s12, s11
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, s26
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s26, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s28, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s26, s26, s28
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[16:17]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[14:15]
-; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s12, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s26, 16
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[1:2]
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s26, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s25, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s27, s73
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s26, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s24, s42
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 24, v55
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v55
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54
@@ -176356,116 +176474,107 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v17
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s61
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s72
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[44:45], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[42:43], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[40:41], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[28:29], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[36:37], s[10:11], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[38:39], s[4:5], 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s45, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s45, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s44, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s44, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 24
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s19, s59
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[40:41], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[28:29], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[10:11], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[8:9], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[6:7], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[4:5], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[12:13], 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s41, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s41, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s40, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s40, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s29, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s29, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s28, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s11, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s10, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s10, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s10, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s9, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s15, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s15, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s14, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s14, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s13, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s13, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s12, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s12, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s9, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s9, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s8, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s8, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s7, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s7, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s6, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s6, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s5, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s5, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s4, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s4, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s43, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s43, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s42, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s42, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s41, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s41, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s40, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s40, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s7, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s7, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s6, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s5, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s5, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s4, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s4, 8
; GFX11-TRUE16-NEXT: s_branch .LBB91_5
; GFX11-TRUE16-NEXT: .LBB91_3:
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 0
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16
@@ -176476,198 +176585,198 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 1
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 3
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 4
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 5
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 6
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 7
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 12
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 18
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 19
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 21
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 25
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 26
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 1
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 2
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 5
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s11, 12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 17
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 18
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s11, 19
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 20
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 21
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 22
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 23
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 24
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 25
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s11, 26
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 27
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 28
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 29
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 30
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s10, 31
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 0
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 1
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 2
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 5
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s10, 7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s74, 8
; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s75, 9
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: s_branch .LBB91_2
; GFX11-TRUE16-NEXT: .LBB91_4:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s90
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s92
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s94
+; GFX11-TRUE16-NEXT: v_readlane_b32 s90, v43, 25
+; GFX11-TRUE16-NEXT: v_readlane_b32 s92, v43, 18
+; GFX11-TRUE16-NEXT: v_readlane_b32 s94, v42, 8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s30
-; GFX11-TRUE16-NEXT: v_readlane_b32 s94, v43, 25
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v43, 18
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s37
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s39
-; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 26
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 19
-; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v42, 8
-; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v43, 11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s40
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s41
+; GFX11-TRUE16-NEXT: v_readlane_b32 s91, v43, 26
+; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v43, 19
+; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v42, 9
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v43, 11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s87
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s42
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s43
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s96
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s44
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s45
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s58
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s60
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s61
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s45
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s58
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s59
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s40
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s62
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s63
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s41
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s72
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s73
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s97
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s29
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s98
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s69
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s70
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s68
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s69
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s35
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s71
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s48
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s80
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s81
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s82
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s49
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s51
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s54
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s70
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s49
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s39
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s71
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s80
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s37
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s81
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s64
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s83
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s84
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s66
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s65
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s85
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s52
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s67
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s66
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s52
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s48
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s13
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s86
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s54
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s68
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s67
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s74
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s76
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s78
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s88
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s90
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s92
-; GFX11-TRUE16-NEXT: v_readlane_b32 s58, v43, 0
-; GFX11-TRUE16-NEXT: v_readlane_b32 s59, v43, 2
-; GFX11-TRUE16-NEXT: v_readlane_b32 s60, v43, 7
-; GFX11-TRUE16-NEXT: v_readlane_b32 s61, v43, 14
-; GFX11-TRUE16-NEXT: v_readlane_b32 s62, v43, 21
-; GFX11-TRUE16-NEXT: v_readlane_b32 s63, v43, 28
-; GFX11-TRUE16-NEXT: v_readlane_b32 s72, v42, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s63, v43, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s62, v43, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s58, v43, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s59, v43, 14
+; GFX11-TRUE16-NEXT: v_readlane_b32 s72, v43, 21
+; GFX11-TRUE16-NEXT: v_readlane_b32 s44, v43, 28
+; GFX11-TRUE16-NEXT: v_readlane_b32 s45, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s73, v42, 6
-; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v42, 7
-; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v42, 5
-; GFX11-TRUE16-NEXT: v_readlane_b32 s45, v42, 4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s44, v42, 3
-; GFX11-TRUE16-NEXT: v_readlane_b32 s47, v42, 2
-; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v42, 0
-; GFX11-TRUE16-NEXT: v_readlane_b32 s57, v43, 31
-; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v43, 30
+; GFX11-TRUE16-NEXT: v_readlane_b32 s43, v42, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s41, v42, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s47, v42, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s40, v42, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s57, v42, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s29, v42, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s61, v43, 31
+; GFX11-TRUE16-NEXT: v_readlane_b32 s28, v43, 30
; GFX11-TRUE16-NEXT: v_readlane_b32 s74, v43, 29
-; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v43, 27
+; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v43, 27
; GFX11-TRUE16-NEXT: v_readlane_b32 s75, v43, 24
-; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v43, 23
+; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v43, 23
; GFX11-TRUE16-NEXT: v_readlane_b32 s76, v43, 22
-; GFX11-TRUE16-NEXT: v_readlane_b32 s77, v43, 20
-; GFX11-TRUE16-NEXT: v_readlane_b32 s78, v43, 17
-; GFX11-TRUE16-NEXT: v_readlane_b32 s79, v43, 16
-; GFX11-TRUE16-NEXT: v_readlane_b32 s88, v43, 15
-; GFX11-TRUE16-NEXT: v_readlane_b32 s89, v43, 13
-; GFX11-TRUE16-NEXT: v_readlane_b32 s90, v43, 10
-; GFX11-TRUE16-NEXT: v_readlane_b32 s91, v43, 9
-; GFX11-TRUE16-NEXT: s_mov_b32 s92, s100
-; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v43, 8
-; GFX11-TRUE16-NEXT: v_readlane_b32 s43, v43, 6
-; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 5
-; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v42, 9
-; GFX11-TRUE16-NEXT: v_readlane_b32 s42, v43, 4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s41, v43, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v43, 12
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v43, 20
+; GFX11-TRUE16-NEXT: v_readlane_b32 s77, v43, 17
+; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v43, 16
+; GFX11-TRUE16-NEXT: v_readlane_b32 s78, v43, 15
+; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v43, 13
+; GFX11-TRUE16-NEXT: v_readlane_b32 s79, v43, 10
+; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v43, 9
+; GFX11-TRUE16-NEXT: s_mov_b32 s88, s100
+; GFX11-TRUE16-NEXT: v_readlane_b32 s89, v43, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v43, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s91, v43, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v43, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v43, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 12
+; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 3
; GFX11-TRUE16-NEXT: .LBB91_5: ; %end
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s34
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s104
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, s56
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s60
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff
@@ -176675,7 +176784,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s103
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, s58
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s63
; GFX11-TRUE16-NEXT: s_mov_b32 s7, s102
; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
@@ -176689,18 +176798,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4
; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s12
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s42
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s99
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, s46
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s56
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s41
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, s59
-; GFX11-TRUE16-NEXT: s_mov_b32 s7, s31
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s93
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s95
; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff
@@ -176716,9 +176825,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, s42
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, s95
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, s8
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s91
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s46
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
@@ -176726,9 +176835,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s60
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s93
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s9
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s58
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s89
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff
@@ -176742,9 +176851,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s91
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s90
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s38
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s12
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s79
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s30
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff
@@ -176752,9 +176861,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s19
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s89
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, s61
-; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s13
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s59
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s78
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff
@@ -176770,9 +176879,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, s79
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, s78
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s30
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, s14
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s77
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s92
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
@@ -176780,8 +176889,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s77
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s15
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s72
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s76
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
@@ -176796,9 +176905,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s75
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s94
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s90
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff
@@ -176806,8 +176915,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s23
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, s63
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s44
; GFX11-TRUE16-NEXT: s_mov_b32 s7, s74
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
@@ -176827,9 +176936,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, s10
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, s57
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s36
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s61
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s94
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
@@ -176837,9 +176946,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s72
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s47
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s45
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s57
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff
@@ -176853,9 +176962,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, s44
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s45
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s92
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s47
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s88
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff
@@ -176863,13 +176972,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s15
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s41
; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-TRUE16-NEXT: s_mov_b32 s6, s73
; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5
; GFX11-TRUE16-NEXT: s_and_b32 s5, s6, 0xff
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, s13
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s43
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
@@ -177110,20 +177219,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v8
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3
; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s99, 3
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v9
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v10
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s46, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4
; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s100, 4
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v12
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v13
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s45, v14
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5
; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s101, 5
; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0
-; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6
; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s102, 6
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7
@@ -177155,522 +177264,511 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s87, 31
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB91_3
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 15
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s2, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s2, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s1, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 14
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s0, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[22:23], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 13
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s0, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s45, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s45, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 17
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s45, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s44, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s44, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 18
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s47, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s47, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s47, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 19
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s46, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s57, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s57, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 13
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s57, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s56, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s56, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 20
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s59, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s59, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s59, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 21
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s5, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s5, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s4, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 15
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s26, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s7, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s25, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s7, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s6, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s9, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 17
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s9, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s9, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s8, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 11
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s8, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s59, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s59, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 18
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s59, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s58, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s58, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s61, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 22
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s61, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s61, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s60, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 23
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s60, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s63, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s58, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 19
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s61, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s61, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s61, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 20
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s60, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s60, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s63, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 21
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s63, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 12
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s63, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s62, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s62, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s63, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s62, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s62, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s73, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s73, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s73, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 22
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s73, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 25
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s72, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s72, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s72, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s72, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 23
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s29, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 26
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s29, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s28, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 27
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s28, 8
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 11
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[44:45], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[46:47], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 28
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[56:57], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s28, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[100:101], s[26:27], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 25
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[102:103], s[24:25], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 26
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s20, 16
; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 29
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 27
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s20, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 30
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 28
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 24
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 31
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 10
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 29
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 30
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 31
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 3
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s17, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s17, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 9
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s16, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 6
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s16, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 24
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s46, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 6
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 7
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 5
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 3
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 1
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s2, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s42, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s2, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s43, 5
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[20:21], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s42, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s10, 9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s6, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s43, 3
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s46, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s47, 1
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[2:3], 24
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB91_4
; GFX11-FAKE16-NEXT: .LBB91_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX11-FAKE16-NEXT: s_and_b32 s14, s47, 0xffff0000
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s47, 16
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s29, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s13, s9, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s9, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s76, s28, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s77, s28, 16
; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s45, 0xffff0000
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v6
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s28, s58, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s29, s58, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s11
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s45, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s78, s28, 0xffff0000
-; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s47, 0x10010
-; GFX11-FAKE16-NEXT: s_lshl_b32 s79, s28, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s6, s47
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s77, s73, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s75, s72, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s76, s72, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s63, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s74, s63, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s72, s62, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s73, s62, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s63, s61, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s62, s61, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s61, s60, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s60, s60, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s12, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s6, 16
+; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s58, 0x10010
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-FAKE16-NEXT: s_and_b32 s41, s59, 0xffff0000
; GFX11-FAKE16-NEXT: s_lshl_b32 s40, s59, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s28, s58, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s29, s58, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s13, s57, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s57, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s42, s56, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s56, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s12, s46, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s46, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s44, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s44, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s47, 22
+; GFX11-FAKE16-NEXT: s_add_i32 s59, s6, s58
+; GFX11-FAKE16-NEXT: s_and_b32 s74, s73, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s75, s73, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s73, s72, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s72, s72, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s63, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s63, s63, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s56, s62, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s57, s62, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s47, s61, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s61, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s60, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s46, s60, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s42, s8, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s8, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s5, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s4, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s59, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s58, 22
; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s47, s45
+; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s58, s59
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s44, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s4, 16
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s78
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s76
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s79
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s77
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s1, 0x10010
+; GFX11-FAKE16-NEXT: s_bfe_u32 s58, s1, 0x10010
; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s1
+; GFX11-FAKE16-NEXT: s_add_i32 s58, s58, s1
; GFX11-FAKE16-NEXT: s_bitset1_b32 s1, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s45
-; GFX11-FAKE16-NEXT: s_and_b32 s44, s0, 0xffff0000
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: s_addk_i32 s58, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s58
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v6
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: s_bfe_u32 s58, s4, 0x10010
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_add_i32 s58, s58, s4
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s4, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s58, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s59, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v7
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
-; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s4, s58
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s75
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v21
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s74
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v3
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s77
-; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s0, 0x10010
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v4
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s5, s0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s44, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s58, s0, 0x10010
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v23, 16, v4
+; GFX11-FAKE16-NEXT: s_add_i32 s58, s58, s0
; GFX11-FAKE16-NEXT: s_bitset1_b32 s0, 22
-; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s0, s45
-; GFX11-FAKE16-NEXT: s_and_b32 s44, s3, 0xffff0000
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s44
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_addk_i32 s58, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s59, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s0, s58
+; GFX11-FAKE16-NEXT: s_and_b32 s58, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s58
; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v6
; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v23
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v9
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v1, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v7, 16, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s59, s58, 0x10010
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_add_i32 s59, s59, s58
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s58, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s59, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s60, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s58, s58, s59
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v10
; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s76
-; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s44, 16
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s75
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s72
+; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s58, 16
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 24, v7
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v6
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s3, 0x10010
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s73
+; GFX11-FAKE16-NEXT: s_bfe_u32 s59, s3, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s3
+; GFX11-FAKE16-NEXT: s_add_i32 s59, s59, s3
; GFX11-FAKE16-NEXT: s_bitset1_b32 s3, 22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s45
-; GFX11-FAKE16-NEXT: s_and_b32 s44, s2, 0xffff0000
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s44
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v9
+; GFX11-FAKE16-NEXT: s_addk_i32 s59, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s58, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s59
+; GFX11-FAKE16-NEXT: s_and_b32 s58, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s58
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v1
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v3
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v24
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s74
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v5
-; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s59, s58, 0x10010
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: s_add_i32 s59, s59, s58
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s58, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s59, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s60, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s58, s58, s59
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v14
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s63
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s11
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s2, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s11, s2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s44, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_add_i32 s59, s11, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s58, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s59, 0x7fff
; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 22
-; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s45
-; GFX11-FAKE16-NEXT: s_and_b32 s44, s17, 0xffff0000
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v26
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s44
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s58, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s59
+; GFX11-FAKE16-NEXT: s_and_b32 s58, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s58
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v4, 16, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v13
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_bfe_u32 s59, s58, 0x10010
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_add_i32 s59, s59, s58
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s58, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s59, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v5
+; GFX11-FAKE16-NEXT: s_and_b32 s60, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s58, s58, s59
; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s73
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s17
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s72
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v4
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s44, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s17, 0x10010
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s57
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v27
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s17
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s56
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s56, s17, 0x10010
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_add_i32 s56, s56, s17
+; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s58, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s56, 0x7fff
; GFX11-FAKE16-NEXT: s_bitset1_b32 s17, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v28, 16, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-FAKE16-NEXT: s_cselect_b32 s17, s17, s45
-; GFX11-FAKE16-NEXT: s_and_b32 s44, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s57, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s17, s17, s56
+; GFX11-FAKE16-NEXT: s_and_b32 s56, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v10, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s56
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v29, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v10
; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s17, 16
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 24, v16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v5, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v29
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s44
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v8
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s47
+; GFX11-FAKE16-NEXT: s_bfe_u32 s47, s56, 0x10010
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_add_i32 s47, s47, s56
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s56, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s47, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s57, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s47, s56, s47
; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s44, 16
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s16, v8
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s62
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s16, 0x10010
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s16
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s16, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s16, s16, s45
-; GFX11-FAKE16-NEXT: s_and_b32 s44, s19, 0xffff0000
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s44
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v5
-; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s16, 16
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v7, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 24, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s16, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s45
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v4
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s61
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s16, 0x10010
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_add_i32 s56, s45, s16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s47, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s56, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s16, 22
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_cselect_b32 s16, s16, s56
+; GFX11-FAKE16-NEXT: s_and_b32 s47, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s47
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v3, 16, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s16, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s46
+; GFX11-FAKE16-NEXT: s_bfe_u32 s44, s47, 0x10010
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v6
+; GFX11-FAKE16-NEXT: s_add_i32 s44, s44, s47
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s47, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s47, s44
; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s19, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s19
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s44, 16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v10
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v9, v8
-; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s19, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s44, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s46, s19, 0x10010
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v11, v10
+; GFX11-FAKE16-NEXT: s_add_i32 s46, s46, s19
; GFX11-FAKE16-NEXT: s_bitset1_b32 s19, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_addk_i32 s46, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s19, s19, s45
+; GFX11-FAKE16-NEXT: s_cselect_b32 s19, s19, s46
; GFX11-FAKE16-NEXT: s_and_b32 s44, s18, 0xffff0000
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s44
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v5, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11-FAKE16-NEXT: s_lshr_b32 s19, s19, 16
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s29
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s41
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v4
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s47, s17, s72
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s41
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
; GFX11-FAKE16-NEXT: s_bfe_u32 s44, s41, 0x10010
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-FAKE16-NEXT: s_add_i32 s44, s44, s41
; GFX11-FAKE16-NEXT: s_bitset1_b32 s41, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s40
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s41, s41, s44
; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v31
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s18
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v32
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v30, 16, v4
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s18, v5
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s18, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v3, 16, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
; GFX11-FAKE16-NEXT: s_bfe_u32 s40, s18, 0x10010
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s28
; GFX11-FAKE16-NEXT: s_add_i32 s44, s40, s18
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s41, 16
; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff
@@ -177678,105 +177776,105 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s18, s18, s44
; GFX11-FAKE16-NEXT: s_and_b32 s41, s21, 0xffff0000
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s41
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v9, 16, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s18, 16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v9
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v9
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s41
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s18, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v14, 16, 1
; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s28, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3
; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s28
; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s28, s28, s29
; GFX11-FAKE16-NEXT: s_lshl_b32 s21, s21, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s21
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s44, s2, s11
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v11
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s21
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s28, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v19
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v6, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s21, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s21
; GFX11-FAKE16-NEXT: s_bitset1_b32 s21, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s21, s21, s29
; GFX11-FAKE16-NEXT: s_and_b32 s28, s20, 0xffff0000
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s28
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v10
-; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s21, 16
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s45, s3, s59
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s46, s16, s46
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s28
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s13
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v18
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s13, 0x10010
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v35
; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s13
; GFX11-FAKE16-NEXT: s_bitset1_b32 s13, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s13, s28
; GFX11-FAKE16-NEXT: s_lshl_b32 s20, s20, 16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s20
-; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s10
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v35
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s20, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v2, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s20
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s20, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v4, 16, v11
; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s20, 0x10010
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
; GFX11-FAKE16-NEXT: s_add_i32 s28, s10, s20
; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s13, 16
; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
; GFX11-FAKE16-NEXT: s_bitset1_b32 s20, 22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s20, s28
; GFX11-FAKE16-NEXT: s_and_b32 s20, s23, 0xffff0000
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s42
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s20
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s43
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s42
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s20
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s43
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-FAKE16-NEXT: s_bfe_u32 s20, s28, 0x10010
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v10, 16, 1
; GFX11-FAKE16-NEXT: s_add_i32 s29, s20, s28
; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s13, 16
; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
@@ -177784,235 +177882,241 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s28, s29
; GFX11-FAKE16-NEXT: s_lshl_b32 s23, s23, 16
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s23
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s13, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v9
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v2
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v6, v10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s13, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v10
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s23, 0x10010
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s23
; GFX11-FAKE16-NEXT: s_bitset1_b32 s23, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s23, s28
; GFX11-FAKE16-NEXT: s_and_b32 s23, s22, 0xffff0000
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v36
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s23
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s14
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s23
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s14
; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s13, 16
-; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v71, v37, 16, v4
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v71, v38, 16, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s12
; GFX11-FAKE16-NEXT: s_bfe_u32 s15, s14, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v3
; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, s14
; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s15, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s14, s15
; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s22, 16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
-; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v38
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, 16
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v10
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v2, 16, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s13, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v11
; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s14, 0x10010
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v10
; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s14
; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s12, s14, s12
-; GFX11-FAKE16-NEXT: s_and_b32 s14, s25, 0xffff0000
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s9
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s12, 16
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-FAKE16-NEXT: s_bfe_u32 s14, s9, 0x10010
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_and_b32 s13, s25, 0xffff0000
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, s9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: s_bfe_u32 s13, s9, 0x10010
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, s9
; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s14, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: s_addk_i32 s13, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v3
; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s14
+; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s13
; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s25, 16
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s8
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s9, 16
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s9, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v10
; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s8, 0x10010
-; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1
; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s8
; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s8, s12
; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xffff0000
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s8, 16
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v12, v9
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s6
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v16, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s7, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s7
; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s7, s9
; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s24, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s8
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v10
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v12, 16, 1
-; GFX11-FAKE16-NEXT: s_bfe_u32 s4, s8, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v10, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s8, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s8
; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff
; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s8, s4
+; GFX11-FAKE16-NEXT: s_cselect_b32 s5, s8, s5
; GFX11-FAKE16-NEXT: s_and_b32 s6, s27, 0xffff0000
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v12
-; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s4, 16
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v14, v16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s5, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v11, 16, 1
; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v50
; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6
; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s5, s6, s7
; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v4, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v12
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s6
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s4, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v49
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v8
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v51
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v66, v1, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s5, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v66, v3, 16, v15
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v52
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v48
; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6
; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xffff0000
-; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s4, 16
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v39
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v55, v50, 16, v4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s6, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v53
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v55, v51, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v49, 16, v7
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v54, v4, 16, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s18, s40
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13]
+; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[8:9]
+; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s8
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s29, s8, s9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[18:19]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[1:2]
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s61
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s3, s62
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s26, v3
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v54, v2, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v48, 16, v5
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[17:18]
-; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s6, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16]
-; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s14, s6, s5
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s26, 16
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s20, s10
-; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s14, 16
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7]
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71]
-; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s11, 0x10010
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20]
-; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, s11
-; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22
-; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff
-; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s19, s60
-; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s10, 16
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s23, s62
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 24, v55
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v55
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[54:55]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[4:5], 24, v[66:67]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[70:71]
+; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s26, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[20:21]
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s26
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s26, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s26, s26, s28
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s11
+; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s26, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s19, s59
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s21, s60
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s20, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s63
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s25, s72
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s41, s27, s73
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s40, s26, s40
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s24, s42
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 8, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v54
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v67
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v67
@@ -178022,120 +178126,114 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v71
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v70
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v21
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v17
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 8, v7
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s21, s61
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s25, s63
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s57, s27, s73
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s56, s26, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s24, s12
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[46:47], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[44:45], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 vcc, s[56:57], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s57, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s57, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s56, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s56, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s11, 24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 24, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s58
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s16, s45
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[40:41], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 vcc, s[28:29], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[14:15], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[10:11], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[12:13], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[6:7], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[4:5], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[8:9], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s41, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s41, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s40, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s40, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s29, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s28, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s15, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s15, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s14, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s14, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s11, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s11, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s10, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s10, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s10, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s9, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s13, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s12, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s12, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s9, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s9, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s8, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s8, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s8, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s7, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s7, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s6, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s6, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s5, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s5, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s4, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s4, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s47, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s47, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s46, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s46, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s45, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s45, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s44, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s44, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s29, 24
-; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s7, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s6, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s6, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s4, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s4, 8
; GFX11-FAKE16-NEXT: s_branch .LBB91_5
; GFX11-FAKE16-NEXT: .LBB91_3:
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92
@@ -178143,169 +178241,176 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 1
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 2
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 3
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 5
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 6
-; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 7
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s11, 1
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s11, 3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s10, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s11, 5
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
; GFX11-FAKE16-NEXT: s_branch .LBB91_2
; GFX11-FAKE16-NEXT: .LBB91_4:
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30
-; GFX11-FAKE16-NEXT: v_readlane_b32 s94, v43, 2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35
-; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v43, 3
-; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_lo, v43, 6
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v43, 0
-; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v43, 4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92
-; GFX11-FAKE16-NEXT: s_mov_b32 s58, s11
-; GFX11-FAKE16-NEXT: v_readlane_b32 s59, v43, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s72, v43, 9
-; GFX11-FAKE16-NEXT: v_readlane_b32 s60, v43, 10
-; GFX11-FAKE16-NEXT: v_readlane_b32 s61, v43, 11
-; GFX11-FAKE16-NEXT: v_readlane_b32 s62, v43, 12
-; GFX11-FAKE16-NEXT: v_readlane_b32 s63, v43, 13
-; GFX11-FAKE16-NEXT: v_readlane_b32 s73, v43, 14
-; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v43, 15
-; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v43, 16
-; GFX11-FAKE16-NEXT: v_readlane_b32 s41, v43, 17
-; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v43, 18
-; GFX11-FAKE16-NEXT: v_readlane_b32 s56, v43, 19
-; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v43, 20
-; GFX11-FAKE16-NEXT: v_readlane_b32 s57, v43, 21
-; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v43, 22
-; GFX11-FAKE16-NEXT: v_readlane_b32 s74, v43, 23
-; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v43, 24
-; GFX11-FAKE16-NEXT: v_readlane_b32 s75, v43, 25
-; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v43, 26
-; GFX11-FAKE16-NEXT: v_readlane_b32 s76, v43, 27
-; GFX11-FAKE16-NEXT: v_readlane_b32 s77, v43, 28
-; GFX11-FAKE16-NEXT: v_readlane_b32 s78, v43, 29
-; GFX11-FAKE16-NEXT: v_readlane_b32 s79, v43, 30
-; GFX11-FAKE16-NEXT: v_readlane_b32 s88, v43, 31
-; GFX11-FAKE16-NEXT: v_readlane_b32 s89, v42, 0
-; GFX11-FAKE16-NEXT: v_readlane_b32 s90, v42, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s91, v42, 2
-; GFX11-FAKE16-NEXT: v_readlane_b32 s92, v42, 3
-; GFX11-FAKE16-NEXT: v_readlane_b32 s47, v42, 4
-; GFX11-FAKE16-NEXT: v_readlane_b32 s93, v42, 5
-; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_hi, v43, 7
-; GFX11-FAKE16-NEXT: v_readlane_b32 s46, v42, 6
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v43, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v42, 7
-; GFX11-FAKE16-NEXT: v_readlane_b32 s45, v42, 8
-; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v43, 5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v10, s90
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s94 :: v_dual_mov_b32 v16, s30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s90, v42, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s94, v42, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s91, v42, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v42, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s4 :: v_dual_mov_b32 v52, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v51, s40 :: v_dual_mov_b32 v50, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s7 :: v_dual_mov_b32 v49, s98
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s8 :: v_dual_mov_b32 v38, s97
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s9 :: v_dual_mov_b32 v36, s58
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s59 :: v_dual_mov_b32 v34, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s60 :: v_dual_mov_b32 v32, s61
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v30, s62
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s63 :: v_dual_mov_b32 v29, s96
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s72 :: v_dual_mov_b32 v26, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s73 :: v_dual_mov_b32 v24, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s87 :: v_dual_mov_b32 v54, s86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s84 :: v_dual_mov_b32 v66, s85
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v64, s83
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s10 :: v_dual_mov_b32 v70, s81
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, s48 :: v_dual_mov_b32 v68, s80
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s82 :: v_dual_mov_b32 v80, s71
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s69 :: v_dual_mov_b32 v20, s70
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v71, s65 :: v_dual_mov_b32 v82, s67
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s68 :: v_dual_mov_b32 v18, s66
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s54 :: v_dual_mov_b32 v84, s64
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s39 :: v_dual_mov_b32 v12, s55
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v83, s38 :: v_dual_mov_b32 v86, s52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v8, s51
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v96, s49
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s50 :: v_dual_mov_b32 v2, s35
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v4, s76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v87, s34 :: v_dual_mov_b32 v6, s88
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s78 :: v_dual_mov_b32 v14, s92
+; GFX11-FAKE16-NEXT: s_mov_b32 s61, s41
+; GFX11-FAKE16-NEXT: v_readlane_b32 s62, v42, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s58, v42, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s59, v42, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s60, v42, 9
+; GFX11-FAKE16-NEXT: v_readlane_b32 s63, v42, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s72, v42, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s73, v42, 12
+; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v42, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s41, v42, 14
+; GFX11-FAKE16-NEXT: v_readlane_b32 s45, v42, 15
+; GFX11-FAKE16-NEXT: v_readlane_b32 s40, v42, 16
+; GFX11-FAKE16-NEXT: v_readlane_b32 s47, v42, 17
+; GFX11-FAKE16-NEXT: v_readlane_b32 s29, v42, 18
+; GFX11-FAKE16-NEXT: v_readlane_b32 s57, v42, 19
+; GFX11-FAKE16-NEXT: v_readlane_b32 s28, v42, 20
+; GFX11-FAKE16-NEXT: v_readlane_b32 s74, v42, 21
+; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v42, 22
+; GFX11-FAKE16-NEXT: v_readlane_b32 s75, v42, 23
+; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v42, 24
+; GFX11-FAKE16-NEXT: v_readlane_b32 s76, v42, 25
+; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v42, 26
+; GFX11-FAKE16-NEXT: v_readlane_b32 s77, v42, 27
+; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v42, 28
+; GFX11-FAKE16-NEXT: v_readlane_b32 s78, v42, 29
+; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v42, 30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s79, v42, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v43, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s88, v43, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v43, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s89, v43, 3
+; GFX11-FAKE16-NEXT: s_mov_b32 s92, s100
+; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v43, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s91, v43, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s93, v43, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v43, 7
+; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, s102
+; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_hi, v43, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v43, 9
; GFX11-FAKE16-NEXT: .LBB91_5: ; %end
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s104, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s103, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s42, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s44, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s56, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4
; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s102, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s58, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s99, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s61, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5
; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
@@ -178315,15 +178420,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s100, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s99, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s40, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, vcc_hi, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s93, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s46, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4
; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6
; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s45, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s59, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s95, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s95, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s62, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s91, 8
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5
; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
@@ -178335,15 +178440,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s46, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s93, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s8, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s89, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s30, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s72, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s92, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s9, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s58, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s88, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
@@ -178353,15 +178458,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s91, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s90, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s12, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s12, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s79, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s42, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s89, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s60, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s13, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s59, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s78, 8
; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
@@ -178373,14 +178478,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s79, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s78, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s30, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s10, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s77, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s94, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s77, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s61, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s11, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s60, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s76, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
@@ -178391,14 +178496,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8
; GFX11-FAKE16-NEXT: s_and_b32 s4, s75, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s94, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s90, 8
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s9, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s62, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s15, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s63, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s74, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
@@ -178414,32 +178519,32 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s10, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s28, 8
; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s34, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, vcc_lo, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s4
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s11, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s29, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s63, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s56, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s72, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s47, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
; GFX11-FAKE16-NEXT: s_and_b32 s3, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s43, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s40, 8
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s41, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, vcc_lo, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s15, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s45, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s92, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s41, 8
; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s13, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s43, 8
; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xff
; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
@@ -178447,160 +178552,160 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_and_b32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_lshlrev_b32 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11
; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v96, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v114, s2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v96, v16
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v6, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v11, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v22, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v26, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v24, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v29
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v25, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v1, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v87
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v23, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v27, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v83
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v22, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v26, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v29
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v83
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v23, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v25, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v26, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v27, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v6, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v11, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v13, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v15, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v16, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v32
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v82
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v30
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v1, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v8, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v9, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v82
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v31
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v35
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v8, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v10, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v15, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v18, v19
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v34
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v38
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v69
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v14, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v71
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v39
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v17, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v19, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v36
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v68
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v37
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v1, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v8, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v10, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v18, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v64
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v16, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v19, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v20, v21
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v49
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v66
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v39
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v10, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v16, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v48
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v52
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v54
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v53
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v51
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v50
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v18, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v20, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v21, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v55
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v53
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v52
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v21, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v3, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v10, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v11, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v1, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v6, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v18, v7
; GFX11-FAKE16-NEXT: s_clause 0x5
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:64
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:80
-; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[24:27], off offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[12:15], off offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[8:11], off offset:96
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v41, 8
; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v41, 7
@@ -178689,39 +178794,40 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v44, v19
-; SI-NEXT: v_mov_b32_e32 v43, v17
-; SI-NEXT: v_mov_b32_e32 v32, v14
-; SI-NEXT: v_mov_b32_e32 v14, v12
-; SI-NEXT: v_mov_b32_e32 v12, v10
-; SI-NEXT: v_mov_b32_e32 v41, v7
-; SI-NEXT: v_mov_b32_e32 v55, v5
-; SI-NEXT: v_mov_b32_e32 v54, v3
-; SI-NEXT: v_mov_b32_e32 v51, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v51, v3
+; SI-NEXT: v_mov_b32_e32 v49, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v0
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:392
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148
@@ -178730,129 +178836,135 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:136
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v30
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12
; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v53
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v15
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v25
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v29
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v40
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v42
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37
; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:168
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:176
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212
@@ -178863,27 +178975,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244
@@ -178891,82 +179003,78 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v1
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v2
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340
@@ -178982,79 +179090,64 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:352
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:360
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v1
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:384
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:384
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:216
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:248
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:312
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:344
@@ -179067,6 +179160,265 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT: s_cbranch_execz .LBB92_2
+; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v7, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v49
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
+; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
+; SI-NEXT: v_and_b32_e32 v26, 0xff, v26
+; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
+; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
+; SI-NEXT: v_and_b32_e32 v34, 0xff, v34
+; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
+; SI-NEXT: v_and_b32_e32 v36, 0xff, v36
+; SI-NEXT: v_and_b32_e32 v50, 0xff, v50
+; SI-NEXT: v_and_b32_e32 v52, 0xff, v52
+; SI-NEXT: v_and_b32_e32 v42, 0xff, v42
+; SI-NEXT: v_or_b32_e32 v54, v42, v54
+; SI-NEXT: v_and_b32_e32 v53, 0xff, v53
+; SI-NEXT: v_and_b32_e32 v41, 0xff, v41
+; SI-NEXT: v_or_b32_e32 v27, v41, v27
+; SI-NEXT: v_and_b32_e32 v41, 0xff, v57
+; SI-NEXT: v_or_b32_e32 v1, v41, v1
+; SI-NEXT: v_and_b32_e32 v41, 0xff, v47
+; SI-NEXT: v_or_b32_e32 v3, v41, v3
+; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v63
+; SI-NEXT: v_and_b32_e32 v56, 0xff, v56
+; SI-NEXT: v_and_b32_e32 v55, 0xff, v55
+; SI-NEXT: v_or_b32_e32 v56, v56, v61
+; SI-NEXT: v_or_b32_e32 v55, v55, v62
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_or_b32_e32 v2, v2, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v51
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_or_b32_e32 v2, v2, v6
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v7
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v8
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v10
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v12
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v14
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v6
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v2, v2, v6
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v2, v2, v6
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v23, v2, v6
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v7, v2, v6
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v31, v2, v6
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v55
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v49, v2, v6
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v49
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v25, v2, v6
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v25
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; kill: killed $vgpr0
; SI-NEXT: ; implicit-def: $vgpr0
@@ -179140,748 +179492,483 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; kill: killed $vgpr0
; SI-NEXT: ; implicit-def: $vgpr0
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; SI-NEXT: ; kill: killed $vgpr0
; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; SI-NEXT: s_cbranch_execz .LBB92_2
-; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v51
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT: v_and_b32_e32 v26, 0xff, v26
-; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
-; SI-NEXT: v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT: v_and_b32_e32 v39, 0xff, v50
-; SI-NEXT: v_and_b32_e32 v48, 0xff, v40
-; SI-NEXT: v_and_b32_e32 v49, 0xff, v49
-; SI-NEXT: v_and_b32_e32 v52, 0xff, v52
-; SI-NEXT: v_and_b32_e32 v42, 0xff, v42
-; SI-NEXT: v_and_b32_e32 v46, 0xff, v46
-; SI-NEXT: v_or_b32_e32 v45, v46, v45
-; SI-NEXT: v_and_b32_e32 v56, 0xff, v56
-; SI-NEXT: v_or_b32_e32 v56, v56, v61
-; SI-NEXT: v_and_b32_e32 v57, 0xff, v57
-; SI-NEXT: v_and_b32_e32 v47, 0xff, v47
-; SI-NEXT: v_or_b32_e32 v1, v57, v1
-; SI-NEXT: v_or_b32_e32 v3, v47, v3
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v2, v0
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v54
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v2, v0
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v55
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v2, v6
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v41
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v2, v8
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v9
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v2, v12
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v11
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v2, v14
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v13
-; SI-NEXT: v_mov_b32_e32 v8, v7
-; SI-NEXT: v_mov_b32_e32 v7, v19
-; SI-NEXT: v_or_b32_e32 v19, v2, v32
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v15
-; SI-NEXT: v_and_b32_e32 v35, 0xff, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_or_b32_e32 v17, v2, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v23, v2, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v44
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v23
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v56
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v31, v2, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v21
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v31
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_or_b32_e32 v51, v2, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v51
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v27, v2, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v4, v2, v4
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v29, v2, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v29, v2, v6
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v29
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v29
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_or_b32_e32 v32, v6, v8
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v11
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v5, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v4, v4, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v32
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_or_b32_e32 v11, v6, v8
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v4
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v27
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v33, v6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v33, v6, v8
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v45
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v6, v6, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v6
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v8, v8, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v8
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v12, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v12, v12, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v12
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v14, v14, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v14
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v32, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v32, v32, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v18, v18, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v18, v18, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v18
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v22, v22, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v22, v22, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v22
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v24, v24, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v24, v24, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v24
; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v26, v26, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v26, v26, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v26
; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v28, v28, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v28, v28, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v28
; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v30, v30, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v30, v30, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v34, v34, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v34, v34, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v34
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v16, v16, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v16, v16, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v16
; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v20, v20, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v20, v20, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v20
; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v35, v35, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v35
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v36, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v36, v36, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v13, v13, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v36
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v56
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v37, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v37, v37, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v37
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v8
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v38, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v38, v38, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v38
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v10
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v39, v39, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v36, v36, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v15
+; SI-NEXT: v_or_b32_e32 v12, v12, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v7, v7, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v36
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v7
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v48, v48, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v12
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_or_b32_e32 v15, v15, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xff, v19
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_or_b32_e32 v14, v14, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v15
+; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v49, v49, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v50, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v50, v50, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v14
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_or_b32_e32 v50, v50, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v50
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v9, v9, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v9
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v17
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v54, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v19
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v35, 0xff, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v54, v54, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v35, v35, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v54
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v52, v52, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v52, v52, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v52
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v37, 0xff, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v11, v11, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v37, v37, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v11
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v53, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v38, 0xff, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v53, v53, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v38, v38, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v38
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v39, 0xff, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v39, v39, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v39
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v53, v53, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v53
; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v55, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v55, v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v13, v13, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v41, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v41, v41, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v40, 0xff, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v40, v40, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v40
-; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: v_and_b32_e32 v21, 0xff, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v48
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v48, 0xff, v48
+; SI-NEXT: v_or_b32_e32 v48, v48, v51
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v48
+; SI-NEXT: v_mov_b32_e32 v48, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; SI-NEXT: v_or_b32_e32 v0, v0, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v51, 0xff, v51
+; SI-NEXT: v_or_b32_e32 v51, v51, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v51
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v42, v42, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v42, 0xff, v42
+; SI-NEXT: v_or_b32_e32 v40, v42, v40
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v40
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v42, 0xff, v42
+; SI-NEXT: v_or_b32_e32 v42, v42, v43
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v42
; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v43, 0xff, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v43, v43, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
+; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v43, 0xff, v43
+; SI-NEXT: v_or_b32_e32 v43, v43, v44
+; SI-NEXT: v_and_b32_e32 v44, 0xff, v46
+; SI-NEXT: v_or_b32_e32 v44, v44, v45
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v44
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
-; SI-NEXT: v_or_b32_e32 v15, v15, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v15
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v44, 0xff, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v44, v44, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v46, 0xff, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v46, v46, v58
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v46
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v58, 0xff, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v58, v58, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v29, v58
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v59, 0xff, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v59, v59, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v59
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v45, 0xff, v45
+; SI-NEXT: v_or_b32_e32 v45, v45, v58
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v46, 0xff, v46
+; SI-NEXT: v_or_b32_e32 v46, v46, v59
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v60, 0xff, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v60, v60, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v60
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v58, 0xff, v58
+; SI-NEXT: v_or_b32_e32 v58, v58, v60
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v58
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v61, 0xff, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v25, v61, v25
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v25
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v61, 0xff, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v61, v61, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v44
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v55
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; kill: killed $vgpr0
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v13
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v46
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v41
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: .LBB92_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB92_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v5, v3, v2
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_or_b32_e32 v9, v3, v2
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v19
-; SI-NEXT: v_and_b32_e32 v29, 0xff, v29
-; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v11, v63, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v13, v27, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v55
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v21, v62, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_or_b32_e32 v23, v61, v2
; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34
-; SI-NEXT: v_and_b32_e32 v34, 0xff, v34
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
-; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
+; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v30
+; SI-NEXT: v_and_b32_e32 v29, 0xff, v29
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
@@ -179899,791 +179986,737 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_mov_b32_e32 v17, v43
-; SI-NEXT: v_mov_b32_e32 v19, v44
-; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19
-; SI-NEXT: v_and_b32_e32 v47, 0xff, v47
-; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v15
-; SI-NEXT: v_and_b32_e32 v57, 0xff, v57
-; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: s_movk_i32 s6, 0x300
+; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v51
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49
+; SI-NEXT: v_and_b32_e32 v63, 0xff, v63
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v23, v63, v2
-; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v54
-; SI-NEXT: v_and_b32_e32 v63, 0xff, v63
+; SI-NEXT: v_or_b32_e32 v25, v60, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v25, v25, v2
+; SI-NEXT: v_or_b32_e32 v27, v59, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v31, v62, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56
+; SI-NEXT: v_or_b32_e32 v31, v58, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v33, v61, v2
-; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v17
-; SI-NEXT: v_and_b32_e32 v56, 0xff, v56
-; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v41
-; SI-NEXT: v_and_b32_e32 v61, 0xff, v61
-; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v55
-; SI-NEXT: v_and_b32_e32 v62, 0xff, v62
+; SI-NEXT: v_or_b32_e32 v32, v45, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v35, v60, v2
-; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v9
-; SI-NEXT: v_and_b32_e32 v60, 0xff, v60
-; SI-NEXT: v_or_b32_e32 v12, v12, v60
-; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v25
+; SI-NEXT: v_or_b32_e32 v33, v44, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v37, v59, v2
-; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v11
-; SI-NEXT: v_and_b32_e32 v59, 0xff, v59
-; SI-NEXT: v_or_b32_e32 v14, v14, v59
-; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v31
+; SI-NEXT: v_or_b32_e32 v35, v43, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v38, v58, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46
+; SI-NEXT: v_or_b32_e32 v37, v40, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v39, v45, v2
-; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v21
-; SI-NEXT: v_and_b32_e32 v46, 0xff, v46
-; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v13
-; SI-NEXT: v_and_b32_e32 v58, 0xff, v58
-; SI-NEXT: v_or_b32_e32 v32, v32, v58
-; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v33
-; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v32
+; SI-NEXT: v_or_b32_e32 v38, v54, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v48, v0, v2
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v39, v0, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v48, v0, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v54, v0, v2
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v55, v0, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v41, v0, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v50, v0, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v19, v0, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v49
+; SI-NEXT: v_or_b32_e32 v17, v0, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v40
+; SI-NEXT: v_or_b32_e32 v15, v0, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v36
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v2, v0, v2
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v27, 0xff, v27
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v34
+; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v27
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v2, v0, v2
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v29
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v20, v0, v20
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v16, v0, v16
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v0, v0, v20
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v30
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v0, v0, v29
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v28
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v26
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v24
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v22
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20
-; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v16
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v18
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v36, 0xff, v36
+; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v36
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v0, v0, v30
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v49, 0xff, v49
+; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v34, 0xff, v34
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v49
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v0, v0, v34
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v50, 0xff, v50
+; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v36, 0xff, v36
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v50
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v0, v0, v36
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v52, 0xff, v52
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v54
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v53, 0xff, v53
-; SI-NEXT: v_or_b32_e32 v7, v7, v53
-; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v0, v7, v53
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v48
+; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v40, 0xff, v40
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v40
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v39
+; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v41
+; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v0, v0, v4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v42, 0xff, v42
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v0, v0, v42
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v38
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v43, 0xff, v43
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v43
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v44, 0xff, v44
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v44
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v45, 0xff, v45
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v45
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v48
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v33
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v46, 0xff, v46
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v46
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v39
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v32
+; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v50
+; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v47, 0xff, v47
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v0, v0, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v38
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v56, 0xff, v56
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v56
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v37
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v27
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v57, 0xff, v57
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v57
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v8, v61
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v6, v62
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v25
+; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v35
-; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v23
-; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v63, v0, v63
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v0, v0, v3
+; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v0
+; SI-NEXT: v_and_b32_e32 v58, 0xff, v58
+; SI-NEXT: v_or_b32_e32 v0, v14, v58
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v23
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v0
+; SI-NEXT: v_and_b32_e32 v59, 0xff, v59
+; SI-NEXT: v_or_b32_e32 v0, v12, v59
+; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v21
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v0
+; SI-NEXT: v_and_b32_e32 v60, 0xff, v60
+; SI-NEXT: v_or_b32_e32 v0, v10, v60
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v13
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v0
+; SI-NEXT: v_and_b32_e32 v61, 0xff, v61
+; SI-NEXT: v_or_b32_e32 v0, v8, v61
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v11
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v0
+; SI-NEXT: v_and_b32_e32 v62, 0xff, v62
+; SI-NEXT: v_or_b32_e32 v0, v6, v62
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v1
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v63, v0, v63
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v9
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v62
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v60
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v15
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v17
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v19
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v21
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v56
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v59
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v27
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v25
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v61
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v49
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v50
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v51
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v54
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v55
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v41
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0
+; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v4
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v6
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v10
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v12
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v14
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v16
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v18
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v20
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v22
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v24
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v26
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v27
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v28
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v29, v56
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v46
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v30
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v44
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v46
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v57
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v59
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v42
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v44
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v57
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v47
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v42
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v47
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v53
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v53
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v50
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v40
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v45
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v7
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v9
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v11
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v13
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v15
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v60
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v19
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v21
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v23
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v58
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v61
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v25
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v51
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v54
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v55
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v41
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v62
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12
-; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v63
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v14
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v32
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63
; SI-NEXT: v_cvt_f32_f16_e32 v63, v63
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v0
; SI-NEXT: .LBB92_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v10
+; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_store_dword v0, v10, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v10
+; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v5
; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v10
+; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v5
; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180692,9 +180725,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180703,9 +180736,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180714,9 +180747,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180725,9 +180758,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180736,9 +180769,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180747,9 +180780,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180758,9 +180791,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180769,9 +180802,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180780,9 +180813,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180791,9 +180824,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180802,9 +180835,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180813,9 +180846,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180824,9 +180857,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180835,9 +180868,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180846,9 +180879,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180857,9 +180890,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180868,9 +180901,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180879,9 +180912,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180890,9 +180923,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -180901,27 +180934,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v10
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v10
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v10
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -180929,36 +180962,43 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v32
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v5
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v37
; SI-NEXT: v_cvt_f16_f32_e32 v1, v29
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v10
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v5
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: v_cvt_f16_f32_e32 v1, v23
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v10
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v5
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v17
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v5
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v11
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v5
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v5
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
@@ -185174,11 +185214,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
; SI-NEXT: s_mov_b32 s10, s16
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v61, s29, 0
-; SI-NEXT: v_writelane_b32 v61, s28, 1
-; SI-NEXT: v_writelane_b32 v61, s27, 2
-; SI-NEXT: s_mov_b32 s61, s21
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_writelane_b32 v63, s30, 0
; SI-NEXT: v_writelane_b32 v63, s31, 1
; SI-NEXT: v_writelane_b32 v63, s34, 2
@@ -185213,59 +185249,59 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_writelane_b32 v63, s87, 31
; SI-NEXT: v_writelane_b32 v63, s96, 32
; SI-NEXT: v_writelane_b32 v63, s97, 33
-; SI-NEXT: s_mov_b32 s67, s19
-; SI-NEXT: s_mov_b32 s54, s17
-; SI-NEXT: s_mov_b32 s35, s23
-; SI-NEXT: s_mov_b32 s39, s26
-; SI-NEXT: s_mov_b32 s62, s25
+; SI-NEXT: s_mov_b32 s54, s27
+; SI-NEXT: s_mov_b32 s79, s29
+; SI-NEXT: s_mov_b32 s66, s26
+; SI-NEXT: s_mov_b32 s64, s23
+; SI-NEXT: s_mov_b32 s65, s19
+; SI-NEXT: s_mov_b32 s67, s17
; SI-NEXT: v_writelane_b32 v63, s98, 34
; SI-NEXT: v_writelane_b32 v63, s99, 35
-; SI-NEXT: v_readfirstlane_b32 s99, v1
-; SI-NEXT: v_readfirstlane_b32 s74, v24
+; SI-NEXT: s_mov_b32 s92, s24
+; SI-NEXT: v_readfirstlane_b32 s31, v1
+; SI-NEXT: v_readfirstlane_b32 s81, v23
; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT: v_readfirstlane_b32 s6, v23
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v62, s74, 0
; SI-NEXT: v_readfirstlane_b32 s12, v26
-; SI-NEXT: v_writelane_b32 v62, s6, 1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_writelane_b32 v62, s81, 0
; SI-NEXT: v_readfirstlane_b32 s14, v25
-; SI-NEXT: v_writelane_b32 v62, s12, 2
+; SI-NEXT: v_writelane_b32 v62, s12, 1
; SI-NEXT: v_readfirstlane_b32 s46, v28
-; SI-NEXT: v_writelane_b32 v62, s14, 3
-; SI-NEXT: v_readfirstlane_b32 s56, v27
-; SI-NEXT: v_writelane_b32 v62, s46, 4
-; SI-NEXT: v_readfirstlane_b32 s57, v30
-; SI-NEXT: v_writelane_b32 v62, s56, 5
-; SI-NEXT: v_readfirstlane_b32 s59, v29
-; SI-NEXT: v_writelane_b32 v62, s57, 6
-; SI-NEXT: v_writelane_b32 v62, s59, 7
-; SI-NEXT: s_mov_b32 s60, s20
-; SI-NEXT: s_mov_b32 s63, s24
-; SI-NEXT: v_readfirstlane_b32 s95, v3
-; SI-NEXT: v_readfirstlane_b32 s31, v5
-; SI-NEXT: v_readfirstlane_b32 s24, v9
-; SI-NEXT: v_readfirstlane_b32 s38, v12
+; SI-NEXT: v_writelane_b32 v62, s14, 2
+; SI-NEXT: v_readfirstlane_b32 s57, v27
+; SI-NEXT: v_writelane_b32 v62, s46, 3
+; SI-NEXT: v_readfirstlane_b32 s58, v30
+; SI-NEXT: v_writelane_b32 v62, s57, 4
+; SI-NEXT: s_mov_b32 s77, s25
+; SI-NEXT: v_readfirstlane_b32 s25, v29
+; SI-NEXT: v_writelane_b32 v62, s58, 5
+; SI-NEXT: v_writelane_b32 v62, s25, 6
+; SI-NEXT: v_readfirstlane_b32 s55, v3
+; SI-NEXT: v_readfirstlane_b32 s80, v5
+; SI-NEXT: v_readfirstlane_b32 s51, v7
; SI-NEXT: v_readfirstlane_b32 s36, v11
-; SI-NEXT: v_readfirstlane_b32 s8, v14
-; SI-NEXT: v_readfirstlane_b32 s27, v13
-; SI-NEXT: v_readfirstlane_b32 s9, v16
-; SI-NEXT: v_readfirstlane_b32 s79, v15
+; SI-NEXT: v_readfirstlane_b32 s87, v16
+; SI-NEXT: v_readfirstlane_b32 s84, v15
; SI-NEXT: v_readfirstlane_b32 s13, v18
; SI-NEXT: v_readfirstlane_b32 s15, v17
; SI-NEXT: v_readfirstlane_b32 s42, v20
; SI-NEXT: v_readfirstlane_b32 s43, v19
; SI-NEXT: v_readfirstlane_b32 s44, v22
+; SI-NEXT: v_readfirstlane_b32 s73, v21
+; SI-NEXT: v_readfirstlane_b32 s74, v24
+; SI-NEXT: v_readfirstlane_b32 s62, v14
+; SI-NEXT: v_readfirstlane_b32 s9, v13
+; SI-NEXT: v_readfirstlane_b32 s63, v12
+; SI-NEXT: v_readfirstlane_b32 s61, v10
+; SI-NEXT: v_readfirstlane_b32 s94, v9
+; SI-NEXT: v_readfirstlane_b32 s60, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328
-; SI-NEXT: v_writelane_b32 v61, s4, 3
-; SI-NEXT: v_readfirstlane_b32 s45, v21
-; SI-NEXT: v_readfirstlane_b32 s98, v10
-; SI-NEXT: v_readfirstlane_b32 s90, v8
-; SI-NEXT: v_readfirstlane_b32 s88, v7
-; SI-NEXT: v_readfirstlane_b32 s91, v6
-; SI-NEXT: v_readfirstlane_b32 s93, v4
-; SI-NEXT: v_readfirstlane_b32 s55, v2
+; SI-NEXT: v_writelane_b32 v61, s4, 0
+; SI-NEXT: v_readfirstlane_b32 s35, v6
+; SI-NEXT: v_readfirstlane_b32 s91, v4
+; SI-NEXT: v_readfirstlane_b32 s95, v2
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
@@ -185283,142 +185319,142 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324
-; SI-NEXT: v_writelane_b32 v61, s4, 4
+; SI-NEXT: v_writelane_b32 v61, s4, 1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320
-; SI-NEXT: v_writelane_b32 v61, s4, 5
+; SI-NEXT: v_writelane_b32 v61, s4, 2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316
-; SI-NEXT: v_writelane_b32 v61, s4, 6
+; SI-NEXT: v_writelane_b32 v61, s4, 3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312
-; SI-NEXT: v_writelane_b32 v61, s4, 7
+; SI-NEXT: v_writelane_b32 v61, s4, 4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308
-; SI-NEXT: v_writelane_b32 v61, s4, 8
+; SI-NEXT: v_writelane_b32 v61, s4, 5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304
-; SI-NEXT: v_writelane_b32 v61, s4, 9
+; SI-NEXT: v_writelane_b32 v61, s4, 6
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
-; SI-NEXT: v_writelane_b32 v61, s4, 10
+; SI-NEXT: v_writelane_b32 v61, s4, 7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296
-; SI-NEXT: v_writelane_b32 v61, s4, 11
+; SI-NEXT: v_writelane_b32 v61, s4, 8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292
-; SI-NEXT: v_writelane_b32 v61, s4, 12
+; SI-NEXT: v_writelane_b32 v61, s4, 9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288
-; SI-NEXT: v_writelane_b32 v61, s4, 13
+; SI-NEXT: v_writelane_b32 v61, s4, 10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284
-; SI-NEXT: v_writelane_b32 v61, s4, 14
+; SI-NEXT: v_writelane_b32 v61, s4, 11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280
-; SI-NEXT: v_writelane_b32 v61, s4, 15
+; SI-NEXT: v_writelane_b32 v61, s4, 12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276
-; SI-NEXT: v_writelane_b32 v61, s4, 16
+; SI-NEXT: v_writelane_b32 v61, s4, 13
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272
-; SI-NEXT: v_writelane_b32 v61, s4, 17
+; SI-NEXT: v_writelane_b32 v61, s4, 14
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268
-; SI-NEXT: v_writelane_b32 v61, s4, 18
+; SI-NEXT: v_writelane_b32 v61, s4, 15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264
-; SI-NEXT: v_writelane_b32 v61, s4, 19
+; SI-NEXT: v_writelane_b32 v61, s4, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260
-; SI-NEXT: v_writelane_b32 v61, s4, 20
+; SI-NEXT: v_writelane_b32 v61, s4, 17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
-; SI-NEXT: v_writelane_b32 v61, s4, 21
+; SI-NEXT: v_writelane_b32 v61, s4, 18
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
-; SI-NEXT: v_writelane_b32 v61, s4, 22
+; SI-NEXT: v_writelane_b32 v61, s4, 19
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248
-; SI-NEXT: v_writelane_b32 v61, s4, 23
+; SI-NEXT: v_writelane_b32 v61, s4, 20
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244
-; SI-NEXT: v_writelane_b32 v61, s4, 24
+; SI-NEXT: v_writelane_b32 v61, s4, 21
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240
-; SI-NEXT: v_writelane_b32 v61, s4, 25
+; SI-NEXT: v_writelane_b32 v61, s4, 22
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236
-; SI-NEXT: v_writelane_b32 v61, s4, 26
+; SI-NEXT: v_writelane_b32 v61, s4, 23
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232
-; SI-NEXT: v_writelane_b32 v61, s4, 27
+; SI-NEXT: v_writelane_b32 v61, s4, 24
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228
-; SI-NEXT: v_writelane_b32 v61, s4, 28
+; SI-NEXT: v_writelane_b32 v61, s4, 25
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224
-; SI-NEXT: v_writelane_b32 v61, s4, 29
+; SI-NEXT: v_writelane_b32 v61, s4, 26
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v61, s4, 30
+; SI-NEXT: v_writelane_b32 v61, s4, 27
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s16, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216
-; SI-NEXT: v_writelane_b32 v61, s4, 31
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212
-; SI-NEXT: v_writelane_b32 v61, s4, 32
+; SI-NEXT: v_writelane_b32 v61, s4, 28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s16, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208
+; SI-NEXT: v_writelane_b32 v61, s4, 29
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204
-; SI-NEXT: v_writelane_b32 v61, s4, 33
+; SI-NEXT: v_writelane_b32 v61, s4, 30
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s89, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196
-; SI-NEXT: v_writelane_b32 v61, s4, 34
+; SI-NEXT: v_writelane_b32 v61, s4, 31
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s73, v31
+; SI-NEXT: v_readfirstlane_b32 s93, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188
-; SI-NEXT: v_writelane_b32 v61, s4, 35
+; SI-NEXT: v_writelane_b32 v61, s4, 32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s72, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184
@@ -185426,270 +185462,265 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s40, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s21, v31
+; SI-NEXT: v_readfirstlane_b32 s97, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s85, v31
+; SI-NEXT: v_readfirstlane_b32 s45, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s81, v31
+; SI-NEXT: v_readfirstlane_b32 s85, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s97, v31
+; SI-NEXT: v_readfirstlane_b32 s11, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s7, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
+; SI-NEXT: v_writelane_b32 v61, s4, 33
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s11, v31
+; SI-NEXT: v_readfirstlane_b32 s7, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s41, v31
+; SI-NEXT: v_readfirstlane_b32 s47, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s47, v31
+; SI-NEXT: v_readfirstlane_b32 s41, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s58, v31
+; SI-NEXT: v_readfirstlane_b32 s59, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s76, v31
+; SI-NEXT: v_readfirstlane_b32 s56, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s29, v31
+; SI-NEXT: v_readfirstlane_b32 s78, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s27, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
-; SI-NEXT: v_writelane_b32 v61, s4, 36
-; SI-NEXT: v_writelane_b32 v61, s54, 37
-; SI-NEXT: v_writelane_b32 v61, s10, 38
-; SI-NEXT: v_writelane_b32 v61, s67, 39
-; SI-NEXT: v_writelane_b32 v61, s18, 40
-; SI-NEXT: v_writelane_b32 v61, s61, 41
-; SI-NEXT: v_writelane_b32 v61, s60, 42
-; SI-NEXT: v_writelane_b32 v61, s35, 43
-; SI-NEXT: v_writelane_b32 v61, s22, 44
-; SI-NEXT: v_writelane_b32 v61, s62, 45
-; SI-NEXT: v_writelane_b32 v61, s63, 46
-; SI-NEXT: v_writelane_b32 v61, s39, 47
-; SI-NEXT: v_writelane_b32 v61, s99, 48
-; SI-NEXT: v_writelane_b32 v61, s95, 49
-; SI-NEXT: v_writelane_b32 v61, s31, 50
-; SI-NEXT: v_writelane_b32 v61, s24, 51
-; SI-NEXT: v_writelane_b32 v61, s38, 52
-; SI-NEXT: v_writelane_b32 v61, s36, 53
-; SI-NEXT: v_writelane_b32 v61, s8, 54
-; SI-NEXT: v_writelane_b32 v61, s27, 55
-; SI-NEXT: v_writelane_b32 v61, s9, 56
-; SI-NEXT: v_writelane_b32 v61, s79, 57
-; SI-NEXT: v_writelane_b32 v61, s13, 58
-; SI-NEXT: v_writelane_b32 v61, s15, 59
-; SI-NEXT: v_writelane_b32 v61, s42, 60
-; SI-NEXT: v_writelane_b32 v61, s43, 61
-; SI-NEXT: v_writelane_b32 v61, s44, 62
-; SI-NEXT: v_writelane_b32 v61, s45, 63
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s37, v31
+; SI-NEXT: v_readfirstlane_b32 s39, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s50, v31
+; SI-NEXT: v_readfirstlane_b32 s53, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s48, v31
+; SI-NEXT: v_readfirstlane_b32 s50, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s19, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; SI-NEXT: v_writelane_b32 v61, s4, 34
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s64, v31
+; SI-NEXT: v_readfirstlane_b32 s29, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s17, v31
+; SI-NEXT: v_readfirstlane_b32 s26, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s65, v31
+; SI-NEXT: v_readfirstlane_b32 s23, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s71, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; SI-NEXT: v_writelane_b32 v61, s4, 35
+; SI-NEXT: v_writelane_b32 v61, s67, 36
+; SI-NEXT: v_writelane_b32 v61, s10, 37
+; SI-NEXT: v_writelane_b32 v61, s65, 38
+; SI-NEXT: v_writelane_b32 v61, s18, 39
+; SI-NEXT: v_writelane_b32 v61, s21, 40
+; SI-NEXT: v_writelane_b32 v61, s20, 41
+; SI-NEXT: v_writelane_b32 v61, s64, 42
+; SI-NEXT: v_writelane_b32 v61, s22, 43
+; SI-NEXT: v_writelane_b32 v61, s77, 44
+; SI-NEXT: v_writelane_b32 v61, s92, 45
+; SI-NEXT: v_writelane_b32 v61, s54, 46
+; SI-NEXT: v_writelane_b32 v61, s66, 47
+; SI-NEXT: v_writelane_b32 v61, s79, 48
+; SI-NEXT: v_writelane_b32 v61, s31, 49
+; SI-NEXT: v_writelane_b32 v61, s28, 50
+; SI-NEXT: v_writelane_b32 v61, s55, 51
+; SI-NEXT: v_writelane_b32 v61, s80, 52
+; SI-NEXT: v_writelane_b32 v61, s51, 53
+; SI-NEXT: v_writelane_b32 v61, s36, 54
+; SI-NEXT: v_writelane_b32 v61, s87, 55
+; SI-NEXT: v_writelane_b32 v61, s84, 56
+; SI-NEXT: v_writelane_b32 v61, s13, 57
+; SI-NEXT: v_writelane_b32 v61, s15, 58
+; SI-NEXT: v_writelane_b32 v61, s42, 59
+; SI-NEXT: v_writelane_b32 v61, s43, 60
+; SI-NEXT: v_writelane_b32 v61, s44, 61
+; SI-NEXT: v_writelane_b32 v61, s73, 62
+; SI-NEXT: v_writelane_b32 v61, s74, 63
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s70, v31
+; SI-NEXT: v_readfirstlane_b32 s19, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s83, v31
+; SI-NEXT: v_readfirstlane_b32 s71, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s49, v31
+; SI-NEXT: v_readfirstlane_b32 s17, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s80, v31
+; SI-NEXT: v_readfirstlane_b32 s70, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s82, v31
+; SI-NEXT: v_readfirstlane_b32 s37, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s87, v31
+; SI-NEXT: v_readfirstlane_b32 s82, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s84, v31
+; SI-NEXT: v_readfirstlane_b32 s83, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s51, v31
+; SI-NEXT: v_readfirstlane_b32 s86, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s86, v31
+; SI-NEXT: v_readfirstlane_b32 s30, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s94, v31
+; SI-NEXT: v_readfirstlane_b32 s96, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s96, v31
+; SI-NEXT: v_readfirstlane_b32 s48, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s68, v31
+; SI-NEXT: v_readfirstlane_b32 s98, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s34, v31
+; SI-NEXT: v_readfirstlane_b32 s38, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s77, v31
+; SI-NEXT: v_readfirstlane_b32 s68, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s66, v31
+; SI-NEXT: v_readfirstlane_b32 s99, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s78, v31
+; SI-NEXT: v_readfirstlane_b32 s69, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s53, v31
+; SI-NEXT: v_readfirstlane_b32 s49, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s69, v31
+; SI-NEXT: v_readfirstlane_b32 s6, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s30, v31
+; SI-NEXT: v_readfirstlane_b32 s90, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s52, v31
+; SI-NEXT: v_readfirstlane_b32 s34, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s75, v31
+; SI-NEXT: v_readfirstlane_b32 s52, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s23, v31
+; SI-NEXT: v_readfirstlane_b32 s88, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s28, v31
+; SI-NEXT: v_readfirstlane_b32 s8, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s26, v31
+; SI-NEXT: v_readfirstlane_b32 s24, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s25, v31
+; SI-NEXT: v_readfirstlane_b32 s76, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: v_writelane_b32 v62, s25, 8
-; SI-NEXT: v_writelane_b32 v62, s28, 9
+; SI-NEXT: v_writelane_b32 v62, s76, 7
+; SI-NEXT: v_writelane_b32 v62, s8, 8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s92, v31
-; SI-NEXT: v_writelane_b32 v62, s92, 10
-; SI-NEXT: v_writelane_b32 v62, s75, 11
-; SI-NEXT: v_writelane_b32 v62, s26, 12
-; SI-NEXT: v_writelane_b32 v62, s30, 13
-; SI-NEXT: v_writelane_b32 v62, s23, 14
-; SI-NEXT: v_writelane_b32 v62, s52, 15
-; SI-NEXT: v_writelane_b32 v62, s64, 16
-; SI-NEXT: v_writelane_b32 v62, s17, 17
-; SI-NEXT: v_writelane_b32 v62, s65, 18
-; SI-NEXT: v_writelane_b32 v62, s70, 19
-; SI-NEXT: v_writelane_b32 v62, s71, 20
-; SI-NEXT: v_writelane_b32 v62, s49, 21
-; SI-NEXT: v_writelane_b32 v62, s83, 22
-; SI-NEXT: v_writelane_b32 v62, s80, 23
-; SI-NEXT: v_writelane_b32 v62, s82, 24
-; SI-NEXT: v_writelane_b32 v62, s84, 25
-; SI-NEXT: v_writelane_b32 v62, s87, 26
-; SI-NEXT: v_writelane_b32 v62, s86, 27
-; SI-NEXT: v_writelane_b32 v62, s51, 28
-; SI-NEXT: v_writelane_b32 v62, s96, 29
-; SI-NEXT: v_writelane_b32 v62, s34, 30
-; SI-NEXT: v_writelane_b32 v62, s94, 31
-; SI-NEXT: v_writelane_b32 v62, s53, 32
-; SI-NEXT: v_writelane_b32 v62, s66, 33
-; SI-NEXT: v_writelane_b32 v62, s68, 34
-; SI-NEXT: v_writelane_b32 v62, s69, 35
-; SI-NEXT: v_writelane_b32 v62, s77, 36
-; SI-NEXT: v_writelane_b32 v62, s78, 37
-; SI-NEXT: s_cbranch_scc0 .LBB93_4
+; SI-NEXT: v_readfirstlane_b32 s75, v31
+; SI-NEXT: v_writelane_b32 v62, s75, 9
+; SI-NEXT: v_writelane_b32 v62, s52, 10
+; SI-NEXT: v_writelane_b32 v62, s24, 11
+; SI-NEXT: v_writelane_b32 v62, s90, 12
+; SI-NEXT: v_writelane_b32 v62, s88, 13
+; SI-NEXT: v_writelane_b32 v62, s34, 14
+; SI-NEXT: v_writelane_b32 v62, s17, 15
+; SI-NEXT: v_writelane_b32 v62, s71, 16
+; SI-NEXT: v_writelane_b32 v62, s70, 17
+; SI-NEXT: v_writelane_b32 v62, s37, 18
+; SI-NEXT: v_writelane_b32 v62, s83, 19
+; SI-NEXT: v_writelane_b32 v62, s82, 20
+; SI-NEXT: v_writelane_b32 v62, s30, 21
+; SI-NEXT: v_writelane_b32 v62, s86, 22
+; SI-NEXT: v_writelane_b32 v62, s48, 23
+; SI-NEXT: v_writelane_b32 v62, s38, 24
+; SI-NEXT: v_writelane_b32 v62, s96, 25
+; SI-NEXT: v_writelane_b32 v62, s49, 26
+; SI-NEXT: v_writelane_b32 v62, s99, 27
+; SI-NEXT: v_writelane_b32 v62, s98, 28
+; SI-NEXT: v_writelane_b32 v62, s6, 29
+; SI-NEXT: v_writelane_b32 v62, s68, 30
+; SI-NEXT: v_writelane_b32 v62, s69, 31
+; SI-NEXT: s_cbranch_scc0 .LBB93_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s10, 0xff
-; SI-NEXT: s_lshl_b32 s5, s54, 8
+; SI-NEXT: s_lshl_b32 s5, s67, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
; SI-NEXT: s_and_b32 s4, s18, 0xff
-; SI-NEXT: s_lshl_b32 s5, s67, 8
+; SI-NEXT: s_lshl_b32 s5, s65, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
-; SI-NEXT: s_and_b32 s4, s60, 0xff
-; SI-NEXT: s_lshl_b32 s5, s61, 8
+; SI-NEXT: s_and_b32 s4, s20, 0xff
+; SI-NEXT: s_lshl_b32 s5, s21, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
; SI-NEXT: s_and_b32 s4, s22, 0xff
-; SI-NEXT: s_lshl_b32 s5, s35, 8
+; SI-NEXT: s_lshl_b32 s5, s64, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: s_and_b32 s4, s63, 0xff
-; SI-NEXT: s_lshl_b32 s5, s62, 8
+; SI-NEXT: s_and_b32 s4, s92, 0xff
+; SI-NEXT: s_lshl_b32 s5, s77, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: v_readlane_b32 s5, v61, 2
-; SI-NEXT: s_and_b32 s4, s39, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 8
+; SI-NEXT: s_and_b32 s4, s66, 0xff
+; SI-NEXT: s_lshl_b32 s5, s54, 8
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
+; SI-NEXT: s_and_b32 s4, s28, 0xff
+; SI-NEXT: s_lshl_b32 s5, s79, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: v_readlane_b32 s4, v61, 1
-; SI-NEXT: v_readlane_b32 s5, v61, 0
-; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
-; SI-NEXT: s_and_b32 s4, s99, 0xff
-; SI-NEXT: s_lshl_b32 s5, s55, 8
+; SI-NEXT: s_and_b32 s4, s31, 0xff
+; SI-NEXT: s_lshl_b32 s5, s95, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v10, s4
-; SI-NEXT: s_and_b32 s4, s95, 0xff
-; SI-NEXT: s_lshl_b32 s5, s93, 8
+; SI-NEXT: s_and_b32 s4, s55, 0xff
+; SI-NEXT: s_lshl_b32 s5, s91, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v8, s4
-; SI-NEXT: s_and_b32 s4, s31, 0xff
-; SI-NEXT: s_lshl_b32 s5, s91, 8
+; SI-NEXT: s_and_b32 s4, s80, 0xff
+; SI-NEXT: s_lshl_b32 s5, s35, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v11, s4
-; SI-NEXT: s_and_b32 s4, s88, 0xff
-; SI-NEXT: s_lshl_b32 s5, s90, 8
+; SI-NEXT: v_cvt_f32_f16_e32 v12, s4
+; SI-NEXT: s_and_b32 s4, s51, 0xff
+; SI-NEXT: s_lshl_b32 s5, s60, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v9, s4
-; SI-NEXT: s_and_b32 s4, s24, 0xff
-; SI-NEXT: s_lshl_b32 s5, s98, 8
+; SI-NEXT: s_and_b32 s4, s94, 0xff
+; SI-NEXT: s_lshl_b32 s5, s61, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v13, s4
; SI-NEXT: s_and_b32 s4, s36, 0xff
-; SI-NEXT: s_lshl_b32 s5, s38, 8
+; SI-NEXT: s_lshl_b32 s5, s63, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v12, s4
-; SI-NEXT: s_and_b32 s4, s27, 0xff
-; SI-NEXT: s_lshl_b32 s5, s8, 8
+; SI-NEXT: v_cvt_f32_f16_e32 v11, s4
+; SI-NEXT: s_and_b32 s4, s9, 0xff
+; SI-NEXT: s_lshl_b32 s5, s62, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v15, s4
-; SI-NEXT: s_and_b32 s4, s79, 0xff
-; SI-NEXT: s_lshl_b32 s5, s9, 8
+; SI-NEXT: s_and_b32 s4, s84, 0xff
+; SI-NEXT: s_lshl_b32 s5, s87, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v14, s4
; SI-NEXT: s_and_b32 s4, s15, 0xff
@@ -185700,11 +185731,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_lshl_b32 s5, s42, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v16, s4
-; SI-NEXT: s_and_b32 s4, s45, 0xff
+; SI-NEXT: s_and_b32 s4, s73, 0xff
; SI-NEXT: s_lshl_b32 s5, s44, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v19, s4
-; SI-NEXT: s_and_b32 s4, s6, 0xff
+; SI-NEXT: s_and_b32 s4, s81, 0xff
; SI-NEXT: s_lshl_b32 s5, s74, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v18, s4
@@ -185712,244 +185743,393 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_lshl_b32 s5, s12, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v21, s4
-; SI-NEXT: s_and_b32 s4, s56, 0xff
+; SI-NEXT: s_and_b32 s4, s57, 0xff
; SI-NEXT: s_lshl_b32 s5, s46, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v20, s4
-; SI-NEXT: s_and_b32 s4, s59, 0xff
-; SI-NEXT: s_lshl_b32 s5, s57, 8
+; SI-NEXT: s_and_b32 s4, s25, 0xff
+; SI-NEXT: s_lshl_b32 s5, s58, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v23, s4
-; SI-NEXT: s_and_b32 s4, s92, 0xff
-; SI-NEXT: s_lshl_b32 s5, s25, 8
+; SI-NEXT: s_and_b32 s4, s75, 0xff
+; SI-NEXT: s_lshl_b32 s5, s76, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v22, s4
-; SI-NEXT: s_and_b32 s4, s26, 0xff
-; SI-NEXT: s_lshl_b32 s5, s28, 8
+; SI-NEXT: s_and_b32 s4, s24, 0xff
+; SI-NEXT: s_lshl_b32 s5, s8, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v25, s4
-; SI-NEXT: s_and_b32 s4, s23, 0xff
-; SI-NEXT: s_lshl_b32 s5, s75, 8
+; SI-NEXT: s_and_b32 s4, s88, 0xff
+; SI-NEXT: s_lshl_b32 s5, s52, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v24, s4
-; SI-NEXT: s_and_b32 s4, s52, 0xff
-; SI-NEXT: s_lshl_b32 s5, s30, 8
+; SI-NEXT: s_and_b32 s4, s34, 0xff
+; SI-NEXT: s_lshl_b32 s5, s90, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v27, s4
-; SI-NEXT: s_and_b32 s4, s69, 0xff
-; SI-NEXT: s_lshl_b32 s5, s53, 8
+; SI-NEXT: s_and_b32 s4, s6, 0xff
+; SI-NEXT: s_lshl_b32 s5, s49, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v26, s4
-; SI-NEXT: s_and_b32 s4, s78, 0xff
-; SI-NEXT: s_lshl_b32 s5, s66, 8
+; SI-NEXT: s_and_b32 s4, s69, 0xff
+; SI-NEXT: s_lshl_b32 s5, s99, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v29, s4
-; SI-NEXT: s_and_b32 s4, s77, 0xff
-; SI-NEXT: s_lshl_b32 s5, s34, 8
+; SI-NEXT: s_and_b32 s4, s68, 0xff
+; SI-NEXT: s_lshl_b32 s5, s38, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v28, s4
-; SI-NEXT: s_and_b32 s4, s68, 0xff
-; SI-NEXT: s_lshl_b32 s5, s96, 8
+; SI-NEXT: s_and_b32 s4, s98, 0xff
+; SI-NEXT: s_lshl_b32 s5, s48, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v31, s4
-; SI-NEXT: s_and_b32 s4, s94, 0xff
-; SI-NEXT: s_lshl_b32 s5, s86, 8
+; SI-NEXT: s_and_b32 s4, s96, 0xff
+; SI-NEXT: s_lshl_b32 s5, s30, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v30, s4
-; SI-NEXT: s_and_b32 s4, s51, 0xff
-; SI-NEXT: s_lshl_b32 s5, s84, 8
+; SI-NEXT: s_and_b32 s4, s86, 0xff
+; SI-NEXT: s_lshl_b32 s5, s83, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v33, s4
-; SI-NEXT: s_and_b32 s4, s87, 0xff
-; SI-NEXT: s_lshl_b32 s5, s82, 8
+; SI-NEXT: s_and_b32 s4, s82, 0xff
+; SI-NEXT: s_lshl_b32 s5, s37, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v32, s4
-; SI-NEXT: s_and_b32 s4, s80, 0xff
-; SI-NEXT: s_lshl_b32 s5, s49, 8
+; SI-NEXT: s_and_b32 s4, s70, 0xff
+; SI-NEXT: s_lshl_b32 s5, s17, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v35, s4
-; SI-NEXT: s_and_b32 s4, s83, 0xff
-; SI-NEXT: s_lshl_b32 s5, s70, 8
+; SI-NEXT: s_and_b32 s4, s71, 0xff
+; SI-NEXT: s_lshl_b32 s5, s19, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s13, v61, 35
; SI-NEXT: v_cvt_f32_f16_e32 v34, s4
-; SI-NEXT: s_and_b32 s4, s71, 0xff
-; SI-NEXT: s_lshl_b32 s5, s65, 8
+; SI-NEXT: s_and_b32 s4, s13, 0xff
+; SI-NEXT: s_lshl_b32 s5, s23, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v37, s4
-; SI-NEXT: s_and_b32 s4, s17, 0xff
-; SI-NEXT: s_lshl_b32 s5, s64, 8
+; SI-NEXT: s_and_b32 s4, s26, 0xff
+; SI-NEXT: s_lshl_b32 s5, s29, 8
+; SI-NEXT: s_mov_b32 s79, s9
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s9, v61, 34
; SI-NEXT: v_cvt_f32_f16_e32 v36, s4
-; SI-NEXT: s_and_b32 s4, s19, 0xff
-; SI-NEXT: s_lshl_b32 s5, s48, 8
+; SI-NEXT: s_and_b32 s4, s9, 0xff
+; SI-NEXT: s_lshl_b32 s5, s50, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v39, s4
-; SI-NEXT: s_and_b32 s4, s50, 0xff
-; SI-NEXT: s_lshl_b32 s5, s37, 8
+; SI-NEXT: s_and_b32 s4, s53, 0xff
+; SI-NEXT: s_lshl_b32 s5, s39, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s8, v61, 36
; SI-NEXT: v_cvt_f32_f16_e32 v38, s4
-; SI-NEXT: s_and_b32 s4, s8, 0xff
-; SI-NEXT: s_lshl_b32 s5, s29, 8
+; SI-NEXT: s_and_b32 s4, s27, 0xff
+; SI-NEXT: s_lshl_b32 s5, s78, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v49, s4
-; SI-NEXT: s_and_b32 s4, s76, 0xff
-; SI-NEXT: s_lshl_b32 s5, s58, 8
+; SI-NEXT: s_and_b32 s4, s56, 0xff
+; SI-NEXT: s_lshl_b32 s5, s59, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v48, s4
-; SI-NEXT: s_and_b32 s4, s47, 0xff
-; SI-NEXT: s_lshl_b32 s5, s41, 8
+; SI-NEXT: s_and_b32 s4, s41, 0xff
+; SI-NEXT: s_lshl_b32 s5, s47, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s21, v61, 33
; SI-NEXT: v_cvt_f32_f16_e32 v51, s4
-; SI-NEXT: s_and_b32 s4, s11, 0xff
-; SI-NEXT: s_lshl_b32 s5, s7, 8
+; SI-NEXT: s_and_b32 s4, s7, 0xff
+; SI-NEXT: s_lshl_b32 s5, s21, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v50, s4
-; SI-NEXT: s_and_b32 s4, s97, 0xff
-; SI-NEXT: s_lshl_b32 s5, s81, 8
+; SI-NEXT: s_and_b32 s4, s11, 0xff
+; SI-NEXT: s_lshl_b32 s5, s85, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v53, s4
-; SI-NEXT: s_and_b32 s4, s85, 0xff
-; SI-NEXT: s_lshl_b32 s5, s21, 8
+; SI-NEXT: s_and_b32 s4, s45, 0xff
+; SI-NEXT: s_lshl_b32 s5, s97, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v52, s4
; SI-NEXT: s_and_b32 s4, s40, 0xff
; SI-NEXT: s_lshl_b32 s5, s72, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s69, v61, 35
+; SI-NEXT: v_readlane_b32 s69, v61, 32
; SI-NEXT: v_cvt_f32_f16_e32 v55, s4
; SI-NEXT: s_and_b32 s4, s69, 0xff
-; SI-NEXT: s_lshl_b32 s5, s73, 8
+; SI-NEXT: s_lshl_b32 s5, s93, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s68, v61, 34
+; SI-NEXT: v_readlane_b32 s68, v61, 31
; SI-NEXT: v_cvt_f32_f16_e32 v54, s4
; SI-NEXT: s_and_b32 s4, s68, 0xff
; SI-NEXT: s_lshl_b32 s5, s89, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s66, v61, 33
+; SI-NEXT: v_readlane_b32 s66, v61, 30
+; SI-NEXT: v_readlane_b32 s20, v61, 29
; SI-NEXT: v_cvt_f32_f16_e32 v41, s4
; SI-NEXT: s_and_b32 s4, s66, 0xff
-; SI-NEXT: s_lshl_b32 s5, s16, 8
+; SI-NEXT: s_lshl_b32 s5, s20, 8
+; SI-NEXT: s_mov_b32 s17, s19
+; SI-NEXT: s_mov_b32 s19, s23
+; SI-NEXT: s_mov_b32 s23, s26
+; SI-NEXT: s_mov_b32 s26, s29
+; SI-NEXT: s_mov_b32 s29, s53
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s53, v61, 32
-; SI-NEXT: v_readlane_b32 s94, v61, 31
+; SI-NEXT: v_readlane_b32 s53, v61, 28
; SI-NEXT: v_cvt_f32_f16_e32 v40, s4
; SI-NEXT: s_and_b32 s4, s53, 0xff
-; SI-NEXT: s_lshl_b32 s5, s94, 8
+; SI-NEXT: s_lshl_b32 s5, s16, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s34, v61, 30
-; SI-NEXT: v_readlane_b32 s96, v61, 29
+; SI-NEXT: v_readlane_b32 s34, v61, 27
+; SI-NEXT: v_readlane_b32 s6, v61, 26
; SI-NEXT: v_cvt_f32_f16_e32 v43, s4
; SI-NEXT: s_and_b32 s4, s34, 0xff
-; SI-NEXT: s_lshl_b32 s5, s96, 8
+; SI-NEXT: s_lshl_b32 s5, s6, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s51, v61, 28
-; SI-NEXT: v_readlane_b32 s86, v61, 27
+; SI-NEXT: v_readlane_b32 s98, v61, 25
+; SI-NEXT: v_readlane_b32 s99, v61, 24
; SI-NEXT: v_cvt_f32_f16_e32 v42, s4
-; SI-NEXT: s_and_b32 s4, s51, 0xff
-; SI-NEXT: s_lshl_b32 s5, s86, 8
+; SI-NEXT: s_and_b32 s4, s98, 0xff
+; SI-NEXT: s_lshl_b32 s5, s99, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s87, v61, 26
-; SI-NEXT: v_readlane_b32 s84, v61, 25
+; SI-NEXT: v_readlane_b32 s49, v61, 23
+; SI-NEXT: v_readlane_b32 s96, v61, 22
; SI-NEXT: v_cvt_f32_f16_e32 v45, s4
-; SI-NEXT: s_and_b32 s4, s87, 0xff
-; SI-NEXT: s_lshl_b32 s5, s84, 8
+; SI-NEXT: s_and_b32 s4, s49, 0xff
+; SI-NEXT: s_lshl_b32 s5, s96, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s82, v61, 24
-; SI-NEXT: v_readlane_b32 s80, v61, 23
+; SI-NEXT: v_readlane_b32 s38, v61, 21
+; SI-NEXT: v_readlane_b32 s48, v61, 20
; SI-NEXT: v_cvt_f32_f16_e32 v44, s4
-; SI-NEXT: s_and_b32 s4, s82, 0xff
-; SI-NEXT: s_lshl_b32 s5, s80, 8
+; SI-NEXT: s_and_b32 s4, s38, 0xff
+; SI-NEXT: s_lshl_b32 s5, s48, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s83, v61, 22
-; SI-NEXT: v_readlane_b32 s49, v61, 21
+; SI-NEXT: v_readlane_b32 s86, v61, 19
+; SI-NEXT: v_readlane_b32 s30, v61, 18
; SI-NEXT: v_cvt_f32_f16_e32 v47, s4
-; SI-NEXT: s_and_b32 s4, s83, 0xff
-; SI-NEXT: s_lshl_b32 s5, s49, 8
+; SI-NEXT: s_and_b32 s4, s86, 0xff
+; SI-NEXT: s_lshl_b32 s5, s30, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s71, v61, 20
-; SI-NEXT: v_readlane_b32 s70, v61, 19
+; SI-NEXT: v_readlane_b32 s82, v61, 17
+; SI-NEXT: v_readlane_b32 s83, v61, 16
; SI-NEXT: v_cvt_f32_f16_e32 v46, s4
-; SI-NEXT: s_and_b32 s4, s71, 0xff
-; SI-NEXT: s_lshl_b32 s5, s70, 8
+; SI-NEXT: s_and_b32 s4, s82, 0xff
+; SI-NEXT: s_lshl_b32 s5, s83, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s65, v61, 18
-; SI-NEXT: v_readlane_b32 s54, v61, 17
+; SI-NEXT: v_readlane_b32 s37, v61, 15
+; SI-NEXT: v_readlane_b32 s70, v61, 14
; SI-NEXT: v_cvt_f32_f16_e32 v57, s4
-; SI-NEXT: s_and_b32 s4, s65, 0xff
-; SI-NEXT: s_lshl_b32 s5, s54, 8
-; SI-NEXT: s_mov_b32 s17, s19
-; SI-NEXT: s_mov_b32 s19, s50
+; SI-NEXT: s_and_b32 s4, s37, 0xff
+; SI-NEXT: s_lshl_b32 s5, s70, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s67, v61, 16
-; SI-NEXT: v_readlane_b32 s50, v61, 15
+; SI-NEXT: v_readlane_b32 s71, v61, 13
+; SI-NEXT: v_readlane_b32 s67, v61, 12
; SI-NEXT: v_cvt_f32_f16_e32 v56, s4
-; SI-NEXT: s_and_b32 s4, s67, 0xff
-; SI-NEXT: s_lshl_b32 s5, s50, 8
+; SI-NEXT: s_and_b32 s4, s71, 0xff
+; SI-NEXT: s_lshl_b32 s5, s67, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s64, v61, 14
-; SI-NEXT: v_readlane_b32 s52, v61, 13
+; SI-NEXT: v_readlane_b32 s65, v61, 11
+; SI-NEXT: v_readlane_b32 s64, v61, 10
; SI-NEXT: v_cvt_f32_f16_e32 v59, s4
-; SI-NEXT: s_and_b32 s4, s64, 0xff
-; SI-NEXT: s_lshl_b32 s5, s52, 8
-; SI-NEXT: s_mov_b32 s23, s48
+; SI-NEXT: s_and_b32 s4, s65, 0xff
+; SI-NEXT: s_lshl_b32 s5, s64, 8
+; SI-NEXT: s_mov_b32 s88, s50
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s35, v61, 12
-; SI-NEXT: v_readlane_b32 s48, v61, 11
+; SI-NEXT: v_readlane_b32 s54, v61, 9
+; SI-NEXT: v_readlane_b32 s50, v61, 8
; SI-NEXT: v_cvt_f32_f16_e32 v58, s4
-; SI-NEXT: s_and_b32 s4, s35, 0xff
-; SI-NEXT: s_lshl_b32 s5, s48, 8
+; SI-NEXT: s_and_b32 s4, s54, 0xff
+; SI-NEXT: s_lshl_b32 s5, s50, 8
+; SI-NEXT: s_mov_b32 s24, s39
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s30, v61, 10
-; SI-NEXT: v_readlane_b32 s39, v61, 9
+; SI-NEXT: v_readlane_b32 s90, v61, 7
+; SI-NEXT: v_readlane_b32 s39, v61, 6
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: s_and_b32 s4, s30, 0xff
+; SI-NEXT: s_and_b32 s4, s90, 0xff
; SI-NEXT: s_lshl_b32 s5, s39, 8
-; SI-NEXT: s_mov_b32 s26, s37
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s37, v61, 8
-; SI-NEXT: v_readlane_b32 s75, v61, 7
+; SI-NEXT: v_readlane_b32 s52, v61, 5
+; SI-NEXT: v_readlane_b32 s75, v61, 4
; SI-NEXT: v_cvt_f32_f16_e32 v60, s4
-; SI-NEXT: s_and_b32 s4, s37, 0xff
+; SI-NEXT: s_and_b32 s4, s52, 0xff
; SI-NEXT: s_lshl_b32 s5, s75, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s92, v61, 6
-; SI-NEXT: v_readlane_b32 s77, v61, 5
+; SI-NEXT: v_readlane_b32 s92, v61, 3
+; SI-NEXT: v_readlane_b32 s77, v61, 2
; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
; SI-NEXT: s_and_b32 s4, s92, 0xff
; SI-NEXT: s_lshl_b32 s5, s77, 8
-; SI-NEXT: s_mov_b32 s28, s29
-; SI-NEXT: s_mov_b32 s29, s76
+; SI-NEXT: s_mov_b32 s8, s27
+; SI-NEXT: s_mov_b32 s27, s78
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s78, v61, 4
-; SI-NEXT: v_readlane_b32 s76, v61, 3
+; SI-NEXT: v_readlane_b32 s78, v61, 1
+; SI-NEXT: v_readlane_b32 s76, v61, 0
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
; SI-NEXT: s_and_b32 s4, s78, 0xff
; SI-NEXT: s_lshl_b32 s5, s76, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_mov_b32 s99, s55
-; SI-NEXT: s_mov_b32 s20, s88
-; SI-NEXT: s_mov_b32 s24, s98
-; SI-NEXT: s_mov_b32 s59, s58
-; SI-NEXT: s_mov_b32 s56, s47
-; SI-NEXT: s_mov_b32 s46, s41
-; SI-NEXT: s_mov_b32 s12, s11
-; SI-NEXT: s_mov_b32 s11, s7
-; SI-NEXT: s_mov_b32 s7, s97
-; SI-NEXT: s_mov_b32 s97, s81
-; SI-NEXT: s_mov_b32 s81, s85
-; SI-NEXT: s_mov_b32 s6, s40
+; SI-NEXT: s_mov_b32 s31, s95
+; SI-NEXT: s_mov_b32 s57, s56
+; SI-NEXT: s_mov_b32 s25, s59
+; SI-NEXT: s_mov_b32 s14, s41
+; SI-NEXT: s_mov_b32 s46, s47
+; SI-NEXT: s_mov_b32 s12, s7
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: s_mov_b32 s11, s85
+; SI-NEXT: s_mov_b32 s81, s45
+; SI-NEXT: s_mov_b32 s85, s97
+; SI-NEXT: s_mov_b32 s45, s40
; SI-NEXT: s_mov_b32 s40, s72
-; SI-NEXT: s_mov_b32 s45, s73
+; SI-NEXT: s_mov_b32 s44, s93
; SI-NEXT: s_mov_b32 s15, s89
; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: s_mov_b32 s55, s93
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: s_mov_b32 s95, s91
+; SI-NEXT: s_mov_b32 s55, s35
+; SI-NEXT: s_mov_b32 s80, s60
+; SI-NEXT: s_mov_b32 s91, s61
+; SI-NEXT: s_mov_b32 s51, s63
+; SI-NEXT: s_mov_b32 s36, s62
+; SI-NEXT: s_branch .LBB93_3
+; SI-NEXT: .LBB93_2:
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: s_mov_b32 s17, s19
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: s_mov_b32 s19, s23
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: s_mov_b32 s23, s26
+; SI-NEXT: s_mov_b32 s26, s29
+; SI-NEXT: s_mov_b32 s29, s53
+; SI-NEXT: s_mov_b32 s88, s50
+; SI-NEXT: s_mov_b32 s24, s39
+; SI-NEXT: s_mov_b32 s8, s27
+; SI-NEXT: s_mov_b32 s27, s78
+; SI-NEXT: s_mov_b32 s25, s59
+; SI-NEXT: s_mov_b32 s57, s56
+; SI-NEXT: s_mov_b32 s46, s47
+; SI-NEXT: s_mov_b32 s14, s41
+; SI-NEXT: s_mov_b32 s12, s7
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: s_mov_b32 s11, s85
+; SI-NEXT: s_mov_b32 s85, s97
+; SI-NEXT: s_mov_b32 s81, s45
+; SI-NEXT: s_mov_b32 s45, s40
+; SI-NEXT: s_mov_b32 s40, s72
+; SI-NEXT: s_mov_b32 s44, s93
+; SI-NEXT: s_mov_b32 s15, s89
+; SI-NEXT: s_mov_b32 s79, s9
+; SI-NEXT: s_mov_b32 s31, s95
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_readlane_b32 s75, v61, 4
+; SI-NEXT: v_readlane_b32 s76, v61, 0
+; SI-NEXT: v_readlane_b32 s77, v61, 2
+; SI-NEXT: v_readlane_b32 s78, v61, 1
+; SI-NEXT: v_readlane_b32 s92, v61, 3
+; SI-NEXT: v_readlane_b32 s39, v61, 6
+; SI-NEXT: v_readlane_b32 s52, v61, 5
+; SI-NEXT: v_readlane_b32 s90, v61, 7
+; SI-NEXT: v_readlane_b32 s50, v61, 8
+; SI-NEXT: v_readlane_b32 s64, v61, 10
+; SI-NEXT: v_readlane_b32 s54, v61, 9
+; SI-NEXT: v_readlane_b32 s67, v61, 12
+; SI-NEXT: v_readlane_b32 s65, v61, 11
+; SI-NEXT: v_readlane_b32 s70, v61, 14
+; SI-NEXT: v_readlane_b32 s71, v61, 13
+; SI-NEXT: v_readlane_b32 s37, v61, 15
+; SI-NEXT: v_readlane_b32 s83, v61, 16
+; SI-NEXT: v_readlane_b32 s30, v61, 18
+; SI-NEXT: v_readlane_b32 s82, v61, 17
+; SI-NEXT: v_readlane_b32 s48, v61, 20
+; SI-NEXT: v_readlane_b32 s86, v61, 19
+; SI-NEXT: v_readlane_b32 s96, v61, 22
+; SI-NEXT: v_readlane_b32 s38, v61, 21
+; SI-NEXT: v_readlane_b32 s49, v61, 23
+; SI-NEXT: v_readlane_b32 s99, v61, 24
+; SI-NEXT: v_readlane_b32 s6, v61, 26
+; SI-NEXT: v_readlane_b32 s98, v61, 25
; SI-NEXT: s_mov_b32 s95, s91
-; SI-NEXT: s_mov_b32 s31, s90
-; SI-NEXT: s_cbranch_execnz .LBB93_3
-; SI-NEXT: .LBB93_2: ; %cmp.true
+; SI-NEXT: s_mov_b32 s55, s35
+; SI-NEXT: s_mov_b32 s80, s60
+; SI-NEXT: v_readlane_b32 s20, v61, 29
+; SI-NEXT: s_mov_b32 s91, s61
+; SI-NEXT: s_mov_b32 s51, s63
+; SI-NEXT: s_mov_b32 s36, s62
+; SI-NEXT: v_readlane_b32 s34, v61, 27
+; SI-NEXT: v_readlane_b32 s53, v61, 28
+; SI-NEXT: v_readlane_b32 s66, v61, 30
+; SI-NEXT: v_readlane_b32 s68, v61, 31
+; SI-NEXT: v_readlane_b32 s69, v61, 32
+; SI-NEXT: v_readlane_b32 s21, v61, 33
+; SI-NEXT: v_readlane_b32 s9, v61, 34
+; SI-NEXT: v_readlane_b32 s13, v61, 35
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: .LBB93_3: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_mov_b32 s35, s79
+; SI-NEXT: s_cbranch_vccnz .LBB93_5
+; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: s_add_i32 s4, s78, 3
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s76, 8
@@ -185958,60 +186138,60 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_and_b32 s5, s5, 0xff
; SI-NEXT: s_lshl_b32 vcc_lo, s77, 8
; SI-NEXT: s_or_b32 s5, vcc_lo, s5
-; SI-NEXT: s_add_i32 vcc_lo, s37, 3
+; SI-NEXT: s_add_i32 vcc_lo, s52, 3
; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff
; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8
; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo
-; SI-NEXT: s_add_i32 vcc_hi, s30, 3
+; SI-NEXT: s_add_i32 vcc_hi, s90, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
; SI-NEXT: s_lshl_b32 s60, s39, 8
; SI-NEXT: s_or_b32 s60, s60, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s35, 3
+; SI-NEXT: s_add_i32 vcc_hi, s54, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s61, s48, 8
+; SI-NEXT: s_lshl_b32 s61, s50, 8
; SI-NEXT: s_or_b32 s61, s61, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s64, 3
+; SI-NEXT: s_add_i32 vcc_hi, s65, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s62, s52, 8
+; SI-NEXT: s_lshl_b32 s62, s64, 8
; SI-NEXT: s_or_b32 s62, s62, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s67, 3
+; SI-NEXT: s_add_i32 vcc_hi, s71, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s63, s50, 8
+; SI-NEXT: s_lshl_b32 s63, s67, 8
; SI-NEXT: s_or_b32 s10, s63, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s65, 3
+; SI-NEXT: s_add_i32 vcc_hi, s37, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s72, s54, 8
+; SI-NEXT: s_lshl_b32 s72, s70, 8
; SI-NEXT: s_or_b32 s72, s72, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s71, 3
+; SI-NEXT: s_add_i32 vcc_hi, s82, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s73, s70, 8
+; SI-NEXT: s_lshl_b32 s73, s83, 8
; SI-NEXT: s_or_b32 s73, s73, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s83, 3
+; SI-NEXT: s_add_i32 vcc_hi, s86, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s74, s49, 8
+; SI-NEXT: s_lshl_b32 s74, s30, 8
; SI-NEXT: s_or_b32 s74, s74, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s82, 3
+; SI-NEXT: s_add_i32 vcc_hi, s38, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s75, s80, 8
+; SI-NEXT: s_lshl_b32 s75, s48, 8
; SI-NEXT: s_or_b32 s75, s75, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s87, 3
+; SI-NEXT: s_add_i32 vcc_hi, s49, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s76, s84, 8
+; SI-NEXT: s_lshl_b32 s76, s96, 8
; SI-NEXT: s_or_b32 s76, s76, vcc_hi
-; SI-NEXT: s_add_i32 vcc_hi, s51, 3
+; SI-NEXT: s_add_i32 vcc_hi, s98, 3
; SI-NEXT: s_add_i32 s93, s53, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT: s_lshl_b32 s77, s86, 8
+; SI-NEXT: s_lshl_b32 s77, s99, 8
; SI-NEXT: s_add_i32 s89, s34, 3
; SI-NEXT: s_and_b32 s93, s93, 0xff
-; SI-NEXT: s_lshl_b32 s78, s94, 8
+; SI-NEXT: s_lshl_b32 s78, s16, 8
; SI-NEXT: s_add_i32 s34, s66, 3
; SI-NEXT: s_or_b32 s77, s77, vcc_hi
; SI-NEXT: s_and_b32 s89, s89, 0xff
-; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8
+; SI-NEXT: s_lshl_b32 vcc_hi, s6, 8
; SI-NEXT: s_or_b32 s22, s78, s93
; SI-NEXT: s_and_b32 s93, s34, 0xff
-; SI-NEXT: s_lshl_b32 s92, s16, 8
+; SI-NEXT: s_lshl_b32 s92, s20, 8
; SI-NEXT: s_add_i32 s53, s68, 3
; SI-NEXT: s_or_b32 s89, vcc_hi, s89
; SI-NEXT: s_or_b32 s92, s92, s93
@@ -186020,261 +186200,251 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_add_i32 s66, s69, 3
; SI-NEXT: s_or_b32 s93, vcc_hi, s93
; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff
-; SI-NEXT: s_lshl_b32 s34, s45, 8
-; SI-NEXT: s_add_i32 s68, s6, 3
+; SI-NEXT: s_lshl_b32 s34, s44, 8
+; SI-NEXT: s_add_i32 s68, s45, 3
; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi
; SI-NEXT: s_and_b32 s34, s68, 0xff
; SI-NEXT: s_lshl_b32 s39, s40, 8
; SI-NEXT: s_add_i32 s69, s81, 3
; SI-NEXT: s_or_b32 s34, s39, s34
; SI-NEXT: s_and_b32 s39, s69, 0xff
-; SI-NEXT: s_lshl_b32 s52, s21, 8
+; SI-NEXT: s_lshl_b32 s52, s85, 8
; SI-NEXT: s_add_i32 s81, s7, 3
; SI-NEXT: s_or_b32 s39, s52, s39
; SI-NEXT: s_and_b32 s52, s81, 0xff
-; SI-NEXT: s_lshl_b32 s53, s97, 8
+; SI-NEXT: s_lshl_b32 s53, s11, 8
; SI-NEXT: s_add_i32 s85, s12, 3
; SI-NEXT: s_or_b32 s52, s53, s52
; SI-NEXT: s_and_b32 s53, s85, 0xff
-; SI-NEXT: s_lshl_b32 s64, s11, 8
-; SI-NEXT: s_add_i32 s97, s56, 3
+; SI-NEXT: s_lshl_b32 s64, s21, 8
+; SI-NEXT: s_add_i32 s97, s14, 3
; SI-NEXT: s_or_b32 s53, s64, s53
; SI-NEXT: s_and_b32 s64, s97, 0xff
; SI-NEXT: s_lshl_b32 s66, s46, 8
-; SI-NEXT: s_add_i32 s21, s29, 3
+; SI-NEXT: s_add_i32 s21, s57, 3
; SI-NEXT: s_or_b32 s64, s66, s64
; SI-NEXT: s_and_b32 s21, s21, 0xff
-; SI-NEXT: s_lshl_b32 s66, s59, 8
+; SI-NEXT: s_lshl_b32 s66, s25, 8
; SI-NEXT: s_add_i32 s25, s8, 3
; SI-NEXT: s_or_b32 s66, s66, s21
; SI-NEXT: s_and_b32 s21, s25, 0xff
-; SI-NEXT: s_lshl_b32 s6, s28, 8
-; SI-NEXT: s_add_i32 s29, s19, 3
+; SI-NEXT: s_lshl_b32 s6, s27, 8
+; SI-NEXT: s_add_i32 s29, s29, 3
+; SI-NEXT: v_readlane_b32 s16, v62, 14
; SI-NEXT: s_or_b32 s67, s6, s21
; SI-NEXT: s_and_b32 s6, s29, 0xff
-; SI-NEXT: s_lshl_b32 s18, s26, 8
-; SI-NEXT: s_add_i32 s28, s17, 3
+; SI-NEXT: s_lshl_b32 s18, s24, 8
+; SI-NEXT: s_add_i32 s28, s9, 3
+; SI-NEXT: s_add_i32 s27, s16, 3
+; SI-NEXT: v_readlane_b32 s16, v62, 12
; SI-NEXT: s_or_b32 s68, s18, s6
; SI-NEXT: s_and_b32 s6, s28, 0xff
-; SI-NEXT: s_lshl_b32 s18, s23, 8
+; SI-NEXT: s_lshl_b32 s18, s88, 8
+; SI-NEXT: s_add_i32 s7, s23, 3
+; SI-NEXT: s_lshl_b32 s23, s16, 8
+; SI-NEXT: v_readlane_b32 s16, v62, 13
; SI-NEXT: s_or_b32 s69, s18, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 17
-; SI-NEXT: s_add_i32 s7, s6, 3
-; SI-NEXT: v_readlane_b32 s16, v62, 15
; SI-NEXT: s_and_b32 s6, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v62, 16
-; SI-NEXT: s_add_i32 s27, s16, 3
-; SI-NEXT: v_readlane_b32 s16, v62, 13
-; SI-NEXT: s_lshl_b32 s7, s7, 8
-; SI-NEXT: s_lshl_b32 s23, s16, 8
-; SI-NEXT: v_readlane_b32 s16, v62, 14
-; SI-NEXT: s_mov_b32 s91, s24
-; SI-NEXT: s_or_b32 s70, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 20
+; SI-NEXT: s_lshl_b32 s7, s26, 8
+; SI-NEXT: s_add_i32 s11, s13, 3
; SI-NEXT: s_add_i32 s24, s16, 3
-; SI-NEXT: v_readlane_b32 s16, v62, 11
-; SI-NEXT: s_add_i32 s11, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 18
-; SI-NEXT: s_lshl_b32 s19, s16, 8
-; SI-NEXT: v_readlane_b32 s16, v62, 12
-; SI-NEXT: s_mov_b32 s90, s20
+; SI-NEXT: v_readlane_b32 s16, v62, 10
+; SI-NEXT: s_or_b32 s70, s7, s6
; SI-NEXT: s_and_b32 s6, s11, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 8
-; SI-NEXT: s_add_i32 s20, s16, 3
-; SI-NEXT: v_readlane_b32 s16, v62, 9
+; SI-NEXT: s_lshl_b32 s7, s19, 8
+; SI-NEXT: s_lshl_b32 s19, s16, 8
+; SI-NEXT: v_readlane_b32 s16, v62, 11
; SI-NEXT: s_or_b32 s71, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 22
+; SI-NEXT: v_readlane_b32 s6, v62, 16
+; SI-NEXT: s_add_i32 s20, s16, 3
+; SI-NEXT: v_readlane_b32 s16, v62, 8
+; SI-NEXT: s_add_i32 s12, s6, 3
+; SI-NEXT: s_lshl_b32 s7, s17, 8
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s17, s16, 8
-; SI-NEXT: v_readlane_b32 s16, v62, 10
-; SI-NEXT: s_add_i32 s12, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 19
+; SI-NEXT: v_readlane_b32 s16, v62, 9
+; SI-NEXT: s_and_b32 s6, s12, 0xff
; SI-NEXT: s_or_b32 s17, s17, s20
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s20, v62, 8
-; SI-NEXT: s_and_b32 s6, s12, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: v_readlane_b32 s20, v62, 7
+; SI-NEXT: s_or_b32 s81, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 17
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s20, s20, 8
-; SI-NEXT: s_or_b32 s81, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 23
+; SI-NEXT: s_add_i32 s14, s6, 3
+; SI-NEXT: v_readlane_b32 s7, v62, 15
; SI-NEXT: s_and_b32 s24, s24, 0xff
; SI-NEXT: s_or_b32 s16, s20, s16
-; SI-NEXT: v_readlane_b32 s20, v62, 7
-; SI-NEXT: s_add_i32 s14, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 21
-; SI-NEXT: s_or_b32 s19, s19, s24
-; SI-NEXT: s_add_i32 s98, s20, 3
-; SI-NEXT: v_readlane_b32 s24, v62, 6
+; SI-NEXT: v_readlane_b32 s20, v62, 6
; SI-NEXT: s_and_b32 s6, s14, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_or_b32 s19, s19, s24
+; SI-NEXT: s_add_i32 s98, s20, 3
+; SI-NEXT: v_readlane_b32 s24, v62, 5
+; SI-NEXT: s_or_b32 s83, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 20
; SI-NEXT: s_and_b32 s20, s98, 0xff
; SI-NEXT: s_lshl_b32 s24, s24, 8
-; SI-NEXT: s_or_b32 s83, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 26
+; SI-NEXT: s_add_i32 s41, s6, 3
+; SI-NEXT: v_readlane_b32 s7, v62, 18
; SI-NEXT: s_and_b32 s27, s27, 0xff
; SI-NEXT: s_or_b32 s20, s24, s20
-; SI-NEXT: v_readlane_b32 s24, v62, 5
-; SI-NEXT: s_add_i32 s41, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 24
-; SI-NEXT: s_or_b32 s23, s23, s27
-; SI-NEXT: s_add_i32 s86, s24, 3
-; SI-NEXT: v_readlane_b32 s27, v62, 4
+; SI-NEXT: v_readlane_b32 s24, v62, 4
; SI-NEXT: s_and_b32 s6, s41, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_or_b32 s23, s23, s27
+; SI-NEXT: s_add_i32 s86, s24, 3
+; SI-NEXT: v_readlane_b32 s27, v62, 3
+; SI-NEXT: s_or_b32 s85, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 22
; SI-NEXT: s_and_b32 s24, s86, 0xff
; SI-NEXT: s_lshl_b32 s27, s27, 8
-; SI-NEXT: s_or_b32 s85, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 28
-; SI-NEXT: s_or_b32 s24, s27, s24
-; SI-NEXT: v_readlane_b32 s27, v62, 3
; SI-NEXT: s_add_i32 s46, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 25
-; SI-NEXT: s_add_i32 s12, s73, 0x300
-; SI-NEXT: s_add_i32 s82, s27, 3
-; SI-NEXT: v_readlane_b32 s73, v62, 2
+; SI-NEXT: v_readlane_b32 s7, v62, 19
+; SI-NEXT: s_or_b32 s24, s27, s24
+; SI-NEXT: v_readlane_b32 s27, v62, 2
; SI-NEXT: s_and_b32 s6, s46, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s12, s73, 0x300
+; SI-NEXT: s_add_i32 s82, s27, 3
+; SI-NEXT: v_readlane_b32 s73, v62, 1
+; SI-NEXT: s_or_b32 s96, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 25
; SI-NEXT: s_and_b32 s27, s82, 0xff
; SI-NEXT: s_lshl_b32 s73, s73, 8
-; SI-NEXT: s_or_b32 s96, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 31
-; SI-NEXT: s_or_b32 s27, s73, s27
-; SI-NEXT: v_readlane_b32 s73, v62, 1
; SI-NEXT: s_add_i32 s47, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 27
-; SI-NEXT: s_add_i32 s13, s74, 0x300
-; SI-NEXT: s_add_i32 s65, s73, 3
-; SI-NEXT: v_readlane_b32 s74, v62, 0
+; SI-NEXT: v_readlane_b32 s7, v62, 21
+; SI-NEXT: s_or_b32 s27, s73, s27
+; SI-NEXT: v_readlane_b32 s73, v62, 0
; SI-NEXT: s_and_b32 s6, s47, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s13, s74, 0x300
+; SI-NEXT: s_add_i32 s65, s73, 3
+; SI-NEXT: v_readlane_b32 s74, v61, 63
+; SI-NEXT: s_or_b32 s97, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 28
; SI-NEXT: s_and_b32 s73, s65, 0xff
; SI-NEXT: s_lshl_b32 s74, s74, 8
-; SI-NEXT: s_or_b32 s97, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 34
-; SI-NEXT: s_or_b32 s73, s74, s73
-; SI-NEXT: v_readlane_b32 s74, v61, 63
; SI-NEXT: s_add_i32 s56, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 29
-; SI-NEXT: s_add_i32 s14, s75, 0x300
-; SI-NEXT: s_add_i32 s54, s74, 3
-; SI-NEXT: v_readlane_b32 s75, v61, 62
+; SI-NEXT: v_readlane_b32 s7, v62, 23
+; SI-NEXT: s_or_b32 s73, s74, s73
+; SI-NEXT: v_readlane_b32 s74, v61, 62
; SI-NEXT: s_and_b32 s6, s56, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s14, s75, 0x300
+; SI-NEXT: s_add_i32 s54, s74, 3
+; SI-NEXT: v_readlane_b32 s75, v61, 61
+; SI-NEXT: s_or_b32 s63, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 30
; SI-NEXT: s_and_b32 s74, s54, 0xff
; SI-NEXT: s_lshl_b32 s75, s75, 8
-; SI-NEXT: s_or_b32 s63, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 36
-; SI-NEXT: s_or_b32 s74, s75, s74
-; SI-NEXT: v_readlane_b32 s75, v61, 61
; SI-NEXT: s_add_i32 s58, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 30
-; SI-NEXT: s_add_i32 s15, s76, 0x300
-; SI-NEXT: s_add_i32 s50, s75, 3
-; SI-NEXT: v_readlane_b32 s76, v61, 60
+; SI-NEXT: v_readlane_b32 s7, v62, 24
+; SI-NEXT: s_or_b32 s74, s75, s74
+; SI-NEXT: v_readlane_b32 s75, v61, 60
; SI-NEXT: s_and_b32 s6, s58, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s15, s76, 0x300
+; SI-NEXT: s_add_i32 s50, s75, 3
+; SI-NEXT: v_readlane_b32 s76, v61, 59
+; SI-NEXT: s_or_b32 s79, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 31
; SI-NEXT: s_and_b32 s75, s50, 0xff
; SI-NEXT: s_lshl_b32 s76, s76, 8
-; SI-NEXT: s_or_b32 s79, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 37
-; SI-NEXT: s_or_b32 s75, s76, s75
-; SI-NEXT: v_readlane_b32 s76, v61, 59
; SI-NEXT: s_add_i32 s59, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 33
-; SI-NEXT: s_add_i32 s18, s77, 0x300
-; SI-NEXT: s_add_i32 s48, s76, 3
-; SI-NEXT: v_readlane_b32 s77, v61, 58
+; SI-NEXT: v_readlane_b32 s7, v62, 27
+; SI-NEXT: s_or_b32 s75, s76, s75
+; SI-NEXT: v_readlane_b32 s76, v61, 58
; SI-NEXT: s_and_b32 s6, s59, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s18, s77, 0x300
+; SI-NEXT: s_add_i32 s48, s76, 3
+; SI-NEXT: v_readlane_b32 s77, v61, 57
+; SI-NEXT: s_or_b32 s78, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 29
; SI-NEXT: s_and_b32 s76, s48, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 8
-; SI-NEXT: s_or_b32 s78, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 35
-; SI-NEXT: s_or_b32 s76, s77, s76
-; SI-NEXT: v_readlane_b32 s77, v61, 57
; SI-NEXT: s_add_i32 s57, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 32
+; SI-NEXT: v_readlane_b32 s7, v62, 26
+; SI-NEXT: s_or_b32 s76, s77, s76
+; SI-NEXT: v_readlane_b32 s77, v61, 56
+; SI-NEXT: s_and_b32 s6, s57, 0xff
+; SI-NEXT: s_lshl_b32 s7, s7, 8
; SI-NEXT: s_add_i32 s11, s72, 0x300
; SI-NEXT: s_add_i32 s72, s79, 0x300
; SI-NEXT: s_add_i32 s37, s77, 3
-; SI-NEXT: v_readlane_b32 s79, v61, 56
-; SI-NEXT: s_and_b32 s6, s57, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: v_readlane_b32 s79, v61, 55
+; SI-NEXT: s_or_b32 s88, s7, s6
; SI-NEXT: s_and_b32 s77, s37, 0xff
; SI-NEXT: s_lshl_b32 s79, s79, 8
-; SI-NEXT: s_or_b32 s88, s7, s6
-; SI-NEXT: s_or_b32 s77, s79, s77
-; SI-NEXT: v_readlane_b32 s79, v61, 55
+; SI-NEXT: s_add_i32 s35, s35, 3
; SI-NEXT: s_add_i32 s21, s89, 0x300
; SI-NEXT: s_add_i32 s89, s88, 0x300
-; SI-NEXT: s_add_i32 s35, s79, 3
-; SI-NEXT: v_readlane_b32 s88, v61, 54
+; SI-NEXT: s_or_b32 s77, s79, s77
; SI-NEXT: s_and_b32 s79, s35, 0xff
-; SI-NEXT: s_lshl_b32 s88, s88, 8
+; SI-NEXT: s_lshl_b32 s88, s36, 8
; SI-NEXT: s_or_b32 s79, s88, s79
-; SI-NEXT: v_readlane_b32 s88, v61, 53
-; SI-NEXT: s_add_i32 s25, s92, 0x300
+; SI-NEXT: v_readlane_b32 s88, v61, 54
; SI-NEXT: s_add_i32 s30, s88, 3
-; SI-NEXT: v_readlane_b32 s92, v61, 52
+; SI-NEXT: s_add_i32 s25, s92, 0x300
; SI-NEXT: s_and_b32 s88, s30, 0xff
-; SI-NEXT: s_lshl_b32 s92, s92, 8
+; SI-NEXT: s_lshl_b32 s92, s51, 8
+; SI-NEXT: s_add_i32 s94, s94, 3
+; SI-NEXT: v_readlane_b32 s90, v61, 53
; SI-NEXT: s_or_b32 s88, s92, s88
-; SI-NEXT: v_readlane_b32 s92, v61, 51
-; SI-NEXT: s_add_i32 s94, s92, 3
; SI-NEXT: s_and_b32 s92, s94, 0xff
; SI-NEXT: s_lshl_b32 s91, s91, 8
; SI-NEXT: s_add_i32 s90, s90, 3
; SI-NEXT: s_or_b32 s91, s91, s92
; SI-NEXT: s_and_b32 s90, s90, 0xff
-; SI-NEXT: s_lshl_b32 s92, s31, 8
+; SI-NEXT: s_lshl_b32 s92, s80, 8
; SI-NEXT: s_or_b32 s90, s92, s90
-; SI-NEXT: v_readlane_b32 s92, v61, 50
+; SI-NEXT: v_readlane_b32 s92, v61, 52
; SI-NEXT: s_add_i32 s92, s92, 3
; SI-NEXT: s_add_i32 s26, s93, 0x300
; SI-NEXT: s_and_b32 s92, s92, 0xff
-; SI-NEXT: s_lshl_b32 s93, s95, 8
+; SI-NEXT: s_lshl_b32 s93, s55, 8
; SI-NEXT: s_or_b32 s92, s93, s92
-; SI-NEXT: v_readlane_b32 s93, v61, 49
+; SI-NEXT: v_readlane_b32 s93, v61, 51
; SI-NEXT: s_add_i32 s93, s93, 3
; SI-NEXT: s_and_b32 s93, s93, 0xff
-; SI-NEXT: s_lshl_b32 s94, s55, 8
+; SI-NEXT: s_lshl_b32 s94, s95, 8
; SI-NEXT: s_or_b32 s93, s94, s93
-; SI-NEXT: v_readlane_b32 s94, v61, 48
+; SI-NEXT: v_readlane_b32 s94, v61, 49
; SI-NEXT: s_add_i32 s94, s94, 3
; SI-NEXT: s_and_b32 s94, s94, 0xff
-; SI-NEXT: s_lshl_b32 s95, s99, 8
+; SI-NEXT: s_lshl_b32 s95, s31, 8
; SI-NEXT: s_or_b32 s94, s95, s94
-; SI-NEXT: v_readlane_b32 s95, v61, 1
+; SI-NEXT: v_readlane_b32 s95, v61, 50
; SI-NEXT: s_add_i32 s95, s95, 3
-; SI-NEXT: v_readlane_b32 s30, v61, 0
+; SI-NEXT: v_readlane_b32 s30, v61, 48
; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300
; SI-NEXT: s_and_b32 s95, s95, 0xff
; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8
; SI-NEXT: v_readlane_b32 s30, v61, 47
; SI-NEXT: s_or_b32 s95, vcc_lo, s95
; SI-NEXT: s_add_i32 vcc_lo, s30, 3
-; SI-NEXT: v_readlane_b32 s30, v61, 2
+; SI-NEXT: v_readlane_b32 s30, v61, 46
; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300
; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff
; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8
-; SI-NEXT: v_readlane_b32 s30, v61, 46
+; SI-NEXT: v_readlane_b32 s30, v61, 45
; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo
; SI-NEXT: s_add_i32 vcc_hi, s30, 3
-; SI-NEXT: v_readlane_b32 s30, v61, 45
+; SI-NEXT: v_readlane_b32 s30, v61, 44
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
; SI-NEXT: s_lshl_b32 s30, s30, 8
; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi
-; SI-NEXT: v_readlane_b32 s30, v61, 44
+; SI-NEXT: v_readlane_b32 s30, v61, 43
; SI-NEXT: s_add_i32 s30, s30, 3
-; SI-NEXT: v_readlane_b32 s31, v61, 43
+; SI-NEXT: v_readlane_b32 s31, v61, 42
; SI-NEXT: s_and_b32 s30, s30, 0xff
; SI-NEXT: s_lshl_b32 s31, s31, 8
; SI-NEXT: s_or_b32 s30, s31, s30
-; SI-NEXT: v_readlane_b32 s31, v61, 42
+; SI-NEXT: v_readlane_b32 s31, v61, 41
; SI-NEXT: s_add_i32 s29, s34, 0x300
; SI-NEXT: s_add_i32 s31, s31, 3
-; SI-NEXT: v_readlane_b32 s34, v61, 41
+; SI-NEXT: v_readlane_b32 s34, v61, 40
; SI-NEXT: s_and_b32 s31, s31, 0xff
; SI-NEXT: s_lshl_b32 s34, s34, 8
; SI-NEXT: s_or_b32 s31, s34, s31
@@ -186282,25 +186452,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v1, s31
; SI-NEXT: s_addk_i32 s30, 0x300
; SI-NEXT: s_addk_i32 vcc_hi, 0x300
-; SI-NEXT: v_readlane_b32 s34, v61, 40
+; SI-NEXT: v_readlane_b32 s34, v61, 39
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s30
; SI-NEXT: s_add_i32 s34, s34, 3
-; SI-NEXT: v_readlane_b32 s35, v61, 39
+; SI-NEXT: v_readlane_b32 s35, v61, 38
; SI-NEXT: s_and_b32 s34, s34, 0xff
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi
; SI-NEXT: s_lshl_b32 s35, s35, 8
-; SI-NEXT: s_addk_i32 vcc_lo, 0x300
+; SI-NEXT: s_addk_i32 s95, 0x300
; SI-NEXT: s_or_b32 s34, s35, s34
-; SI-NEXT: v_readlane_b32 s35, v61, 38
+; SI-NEXT: v_readlane_b32 s35, v61, 37
; SI-NEXT: s_add_i32 s35, s35, 3
-; SI-NEXT: v_readlane_b32 s36, v61, 37
+; SI-NEXT: v_readlane_b32 s36, v61, 36
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s95
; SI-NEXT: s_and_b32 s35, s35, 0xff
; SI-NEXT: s_lshl_b32 s36, s36, 8
; SI-NEXT: s_or_b32 s35, s36, s35
@@ -186347,19 +186517,19 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_addk_i32 s92, 0x300
; SI-NEXT: s_addk_i32 s93, 0x300
; SI-NEXT: s_addk_i32 s94, 0x300
-; SI-NEXT: s_addk_i32 s95, 0x300
+; SI-NEXT: s_addk_i32 vcc_lo, 0x300
; SI-NEXT: s_addk_i32 s34, 0x300
; SI-NEXT: s_addk_i32 s35, 0x300
; SI-NEXT: v_cvt_f32_f16_e32 v6, s35
; SI-NEXT: v_cvt_f32_f16_e32 v5, s34
+; SI-NEXT: v_cvt_f32_f16_e32 v7, vcc_lo
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s95
; SI-NEXT: v_cvt_f32_f16_e32 v10, s94
; SI-NEXT: v_cvt_f32_f16_e32 v8, s93
-; SI-NEXT: v_cvt_f32_f16_e32 v11, s92
+; SI-NEXT: v_cvt_f32_f16_e32 v12, s92
; SI-NEXT: v_cvt_f32_f16_e32 v9, s90
; SI-NEXT: v_cvt_f32_f16_e32 v13, s91
-; SI-NEXT: v_cvt_f32_f16_e32 v12, s88
+; SI-NEXT: v_cvt_f32_f16_e32 v11, s88
; SI-NEXT: v_cvt_f32_f16_e32 v15, s79
; SI-NEXT: v_cvt_f32_f16_e32 v14, s77
; SI-NEXT: v_cvt_f32_f16_e32 v17, s76
@@ -186412,7 +186582,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: v_cvt_f32_f16_e32 v3, s5
; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: .LBB93_3: ; %end
+; SI-NEXT: .LBB93_5: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -186467,26 +186637,26 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v12
; SI-NEXT: v_cvt_f16_f32_e32 v6, v8
; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
@@ -186501,7 +186671,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v11
; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
@@ -186699,134 +186869,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB93_4:
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: s_mov_b32 s17, s19
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: s_mov_b32 s19, s50
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_mov_b32 s23, s48
-; SI-NEXT: s_mov_b32 s26, s37
-; SI-NEXT: s_mov_b32 s28, s29
-; SI-NEXT: s_mov_b32 s29, s76
-; SI-NEXT: s_mov_b32 s59, s58
-; SI-NEXT: s_mov_b32 s56, s47
-; SI-NEXT: s_mov_b32 s46, s41
-; SI-NEXT: s_mov_b32 s12, s11
-; SI-NEXT: s_mov_b32 s11, s7
-; SI-NEXT: s_mov_b32 s7, s97
-; SI-NEXT: s_mov_b32 s97, s81
-; SI-NEXT: s_mov_b32 s81, s85
-; SI-NEXT: s_mov_b32 s6, s40
-; SI-NEXT: s_mov_b32 s40, s72
-; SI-NEXT: s_mov_b32 s45, s73
-; SI-NEXT: s_mov_b32 s15, s89
-; SI-NEXT: s_mov_b32 s24, s98
-; SI-NEXT: s_mov_b32 s20, s88
-; SI-NEXT: s_mov_b32 s99, s55
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_readlane_b32 s75, v61, 7
-; SI-NEXT: v_readlane_b32 s76, v61, 3
-; SI-NEXT: v_readlane_b32 s77, v61, 5
-; SI-NEXT: v_readlane_b32 s78, v61, 4
-; SI-NEXT: v_readlane_b32 s92, v61, 6
-; SI-NEXT: v_readlane_b32 s39, v61, 9
-; SI-NEXT: v_readlane_b32 s37, v61, 8
-; SI-NEXT: v_readlane_b32 s30, v61, 10
-; SI-NEXT: v_readlane_b32 s48, v61, 11
-; SI-NEXT: v_readlane_b32 s52, v61, 13
-; SI-NEXT: v_readlane_b32 s35, v61, 12
-; SI-NEXT: v_readlane_b32 s50, v61, 15
-; SI-NEXT: v_readlane_b32 s64, v61, 14
-; SI-NEXT: v_readlane_b32 s54, v61, 17
-; SI-NEXT: v_readlane_b32 s67, v61, 16
-; SI-NEXT: v_readlane_b32 s65, v61, 18
-; SI-NEXT: v_readlane_b32 s70, v61, 19
-; SI-NEXT: v_readlane_b32 s49, v61, 21
-; SI-NEXT: v_readlane_b32 s71, v61, 20
-; SI-NEXT: v_readlane_b32 s80, v61, 23
-; SI-NEXT: v_readlane_b32 s83, v61, 22
-; SI-NEXT: v_readlane_b32 s84, v61, 25
-; SI-NEXT: v_readlane_b32 s82, v61, 24
-; SI-NEXT: v_readlane_b32 s87, v61, 26
-; SI-NEXT: v_readlane_b32 s86, v61, 27
-; SI-NEXT: v_readlane_b32 s96, v61, 29
-; SI-NEXT: v_readlane_b32 s51, v61, 28
-; SI-NEXT: s_mov_b32 s55, s93
-; SI-NEXT: s_mov_b32 s95, s91
-; SI-NEXT: v_readlane_b32 s94, v61, 31
-; SI-NEXT: s_mov_b32 s31, s90
-; SI-NEXT: v_readlane_b32 s34, v61, 30
-; SI-NEXT: v_readlane_b32 s53, v61, 32
-; SI-NEXT: v_readlane_b32 s66, v61, 33
-; SI-NEXT: v_readlane_b32 s68, v61, 34
-; SI-NEXT: v_readlane_b32 s69, v61, 35
-; SI-NEXT: v_readlane_b32 s8, v61, 36
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: s_branch .LBB93_2
;
; VI-LABEL: bitcast_v128i8_to_v64f16_scalar:
; VI: ; %bb.0:
@@ -186888,13 +186930,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v27
; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
@@ -186906,46 +186949,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200
@@ -186954,34 +186993,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
-; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22
-; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
+; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v26
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v28
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0
+; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -187000,6 +187042,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328
@@ -187008,12 +187055,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36
-; VI-NEXT: s_waitcnt vmcnt(11)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5
-; VI-NEXT: s_waitcnt vmcnt(10)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0
; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44
@@ -187022,47 +187065,45 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116
; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:172
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:196
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:212
; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:252
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
@@ -187072,46 +187113,50 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB93_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -187128,11 +187173,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -187156,6 +187200,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v17, v10
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
@@ -187172,38 +187217,43 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v40, v42
+; VI-NEXT: v_mov_b32_e32 v42, v44
+; VI-NEXT: v_mov_b32_e32 v44, v45
+; VI-NEXT: v_mov_b32_e32 v45, v62
+; VI-NEXT: v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v34, v24
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -187211,77 +187261,74 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v45, v62
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v32, v1
; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v22
-; VI-NEXT: v_mov_b32_e32 v41, v24
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v34, v0
+; VI-NEXT: v_mov_b32_e32 v33, v0
; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v37, v1
-; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v55, v26
+; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v50, v26
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v39, v0
-; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v49, v1
-; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v43, v27
+; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v51, v0
-; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v35, v1
-; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v53, v28
+; VI-NEXT: v_mov_b32_e32 v53, v1
+; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v52, v28
; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v33, v0
-; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v36, v0
+; VI-NEXT: v_mov_b32_e32 v55, v0
+; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_mov_b32_e32 v35, v0
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v41, v1
+; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v63, v27
+; VI-NEXT: v_mov_b32_e32 v46, v57
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v56, v0
+; VI-NEXT: v_mov_b32_e32 v36, v0
; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v58, v1
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v61, v60
-; VI-NEXT: v_mov_b32_e32 v60, v59
+; VI-NEXT: v_mov_b32_e32 v56, v1
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v61, v59
; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -187293,55 +187340,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v50, v0
+; VI-NEXT: v_mov_b32_e32 v58, v0
; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v52, v0
-; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v43, v0
+; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v46, v1
-; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v60, v1
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v63, v0
-; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v54, v0
+; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v47, v1
-; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v57, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
@@ -187373,12 +187418,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_cbranch_execnz .LBB93_3
; VI-NEXT: .LBB93_2: ; %cmp.true
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59
-; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
@@ -187397,165 +187440,147 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: s_lshl_b32 s9, s19, 8
; VI-NEXT: s_add_i32 s16, s16, 3
; VI-NEXT: s_lshl_b32 s10, s17, 8
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59
+; VI-NEXT: v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62
-; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44
-; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45
-; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44
+; VI-NEXT: v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42
-; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40
-; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60
-; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61
-; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46
+; VI-NEXT: v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48
; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38
; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50
+; VI-NEXT: v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v36
; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49
; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57
-; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v58
+; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34
; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14
@@ -187564,67 +187589,78 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36
-; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26
-; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52
-; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54
-; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v35
+; VI-NEXT: v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v25
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v59
+; VI-NEXT: v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13
-; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21
-; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25
+; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51
; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59
-; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1
+; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v63
+; VI-NEXT: v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12
-; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25
+; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26
+; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -187648,15 +187684,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10
; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55
; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53
-; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v52
+; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v53
+; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9
; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27
-; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28
-; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -187672,18 +187707,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42
; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40
-; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1
-; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8
; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11
-; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
@@ -187723,19 +187754,29 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46
; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2
+; VI-NEXT: v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4
; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4
@@ -187802,35 +187843,38 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB93_4:
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v61, v60
-; VI-NEXT: v_mov_b32_e32 v60, v59
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v61, v59
+; VI-NEXT: v_mov_b32_e32 v46, v57
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v40, v42
+; VI-NEXT: v_mov_b32_e32 v42, v44
+; VI-NEXT: v_mov_b32_e32 v44, v45
; VI-NEXT: v_mov_b32_e32 v45, v62
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v57, v5
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v47, v4
-; VI-NEXT: v_mov_b32_e32 v63, v3
-; VI-NEXT: v_mov_b32_e32 v53, v28
-; VI-NEXT: v_mov_b32_e32 v43, v27
-; VI-NEXT: v_mov_b32_e32 v55, v26
-; VI-NEXT: v_mov_b32_e32 v41, v24
-; VI-NEXT: v_mov_b32_e32 v54, v22
+; VI-NEXT: v_mov_b32_e32 v54, v3
+; VI-NEXT: v_mov_b32_e32 v52, v28
+; VI-NEXT: v_mov_b32_e32 v63, v27
+; VI-NEXT: v_mov_b32_e32 v50, v26
+; VI-NEXT: v_mov_b32_e32 v34, v24
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_branch .LBB93_2
@@ -187892,18 +187936,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43
; GFX9-NEXT: s_waitcnt vmcnt(23)
@@ -187932,10 +187976,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
@@ -187947,7 +187991,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
@@ -187995,7 +188039,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
@@ -188022,23 +188066,23 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
-; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
@@ -188051,48 +188095,49 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92
; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100
; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:116
; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:140
; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:156
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164
-; GFX9-NEXT: s_waitcnt vmcnt(21)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204
; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:228
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236
; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:260
; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276
; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284
; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:316
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
@@ -188103,55 +188148,54 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(33)
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(36)
-; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: s_waitcnt vmcnt(38)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
@@ -188161,7 +188205,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB93_2
@@ -188174,7 +188218,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -188211,10 +188255,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
@@ -188230,13 +188274,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -188244,7 +188288,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -188285,8 +188329,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v52, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v50, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -188304,16 +188348,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v48, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_mov_b32_e32 v33, v45
+; GFX9-NEXT: v_mov_b32_e32 v33, v46
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
@@ -188326,7 +188370,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -188335,7 +188379,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -188343,121 +188387,122 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v34, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_mov_b32_e32 v46, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: v_mov_b32_e32 v35, v45
-; GFX9-NEXT: v_mov_b32_e32 v45, v61
-; GFX9-NEXT: v_mov_b32_e32 v61, v42
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v38, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_mov_b32_e32 v54, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v41, v57
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v57, v59
; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v56, v42
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB93_3
; GFX9-NEXT: .LBB93_2:
; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: v_mov_b32_e32 v33, v45
-; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v33, v46
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v56, v61
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: .LBB93_3: ; %Flow
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -188660,7 +188705,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -188720,11 +188765,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
@@ -188759,7 +188804,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; GFX9-NEXT: v_add_u32_e32 v24, 3, v24
-; GFX9-NEXT: v_add_u32_e32 v26, 3, v61
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v62
; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24
; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48
@@ -188768,7 +188813,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v26, 3, v45
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v61
; GFX9-NEXT: v_add_u32_e32 v20, 3, v20
; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20
@@ -188777,7 +188822,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: v_add_u32_e32 v26, 3, v56
; GFX9-NEXT: v_add_u32_e32 v21, 3, v21
-; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21
; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21
@@ -190633,212 +190678,207 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v4
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v8
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v46, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v28
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v10
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v20
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v11, v28
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v14
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v24
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v18
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v17
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v37
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v45, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v43, v50
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v39
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v51
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v22
; SI-NEXT: v_cvt_f16_f32_e32 v56, v53
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v55
; SI-NEXT: v_cvt_f16_f32_e32 v47, v54
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v44
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v59, v57
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v58
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v62
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v59, v41
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v57, v44
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v61
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v42
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v60
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v45
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v26
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; kill: killed $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v26
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v25
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v27
; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v30
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v29
; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v37
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
-; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v63
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v46
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v40
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v63
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v3
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v5
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v7
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v15
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v6
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v14
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cvt_f16_f32_e32 v62, v15
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v16
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v16
; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v6
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v46, v14
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; kill: killed $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; kill: killed $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; kill: killed $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; kill: killed $vgpr14
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v14
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
@@ -190962,20 +191002,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v15
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; kill: killed $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; kill: killed $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
+; SI-NEXT: ; kill: killed $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; kill: killed $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; kill: killed $vgpr15
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
@@ -190985,407 +191035,400 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; kill: killed $vgpr15
; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: v_mov_b32_e32 v45, v46
-; SI-NEXT: v_mov_b32_e32 v46, v6
-; SI-NEXT: v_mov_b32_e32 v6, v5
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr14
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; kill: killed $vgpr15
; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB94_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v16, v46
+; SI-NEXT: v_mov_b32_e32 v42, v40
+; SI-NEXT: v_mov_b32_e32 v40, v55
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v20, v1, v2
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_or_b32_e32 v44, v15, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46
+; SI-NEXT: v_mov_b32_e32 v46, v33
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v41, v15, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v44, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v54, v15, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33
+; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v32, v31
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v41, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v54, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v53, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v53, v15, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v51, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v51, v15, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31
+; SI-NEXT: v_mov_b32_e32 v31, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v52, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v49, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v50, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v48, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v39, v5, v14
-; SI-NEXT: v_alignbit_b32 v5, v41, v44, 24
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v52, v15, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v37, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v49, v15, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; SI-NEXT: v_mov_b32_e32 v13, v12
+; SI-NEXT: v_mov_b32_e32 v12, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v50, v15, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v38, v5, v14
-; SI-NEXT: v_alignbit_b32 v5, v41, v44, 16
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v35, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v48, v15, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v36, v5, v14
-; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v39, v15, v14
+; SI-NEXT: v_alignbit_b32 v14, v41, v44, 24
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v30, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10
+; SI-NEXT: v_or_b32_e32 v37, v15, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v34, v5, v14
-; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v38, v15, v14
+; SI-NEXT: v_alignbit_b32 v14, v41, v44, 16
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v28, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; SI-NEXT: v_or_b32_e32 v35, v15, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v29, v5, v14
-; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v36, v15, v14
+; SI-NEXT: v_alignbit_b32 v14, v41, v44, 8
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v30, v15, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v26, v5, v14
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7
+; SI-NEXT: v_or_b32_e32 v34, v15, v14
+; SI-NEXT: v_alignbit_b32 v14, v53, v54, 24
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v27, v5, v14
-; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v28, v14, v9
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v29, v14, v9
+; SI-NEXT: v_alignbit_b32 v9, v53, v54, 16
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v24, v47, v14
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v26, v43, v9
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_or_b32_e32 v25, v5, v14
-; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v27, v14, v9
+; SI-NEXT: v_alignbit_b32 v9, v53, v54, 8
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59
-; SI-NEXT: v_or_b32_e32 v22, v58, v5
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56
-; SI-NEXT: v_or_b32_e32 v23, v57, v5
-; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v56
+; SI-NEXT: v_or_b32_e32 v24, v47, v9
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43
+; SI-NEXT: v_or_b32_e32 v25, v10, v9
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v9, v52, v51, 24
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62
-; SI-NEXT: v_or_b32_e32 v20, v61, v5
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47
-; SI-NEXT: v_or_b32_e32 v21, v60, v5
-; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63
-; SI-NEXT: v_or_b32_e32 v18, v40, v5
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59
+; SI-NEXT: v_or_b32_e32 v22, v58, v9
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; SI-NEXT: v_or_b32_e32 v23, v57, v9
+; SI-NEXT: v_alignbit_b32 v9, v52, v51, 16
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40
-; SI-NEXT: v_or_b32_e32 v19, v55, v5
-; SI-NEXT: v_alignbit_b32 v5, v50, v49, 24
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_or_b32_e32 v21, v60, v1
+; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43
-; SI-NEXT: v_or_b32_e32 v16, v1, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46
-; SI-NEXT: v_or_b32_e32 v17, v42, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61
+; SI-NEXT: v_or_b32_e32 v18, v4, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT: v_or_b32_e32 v19, v3, v1
+; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_or_b32_e32 v14, v63, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42
+; SI-NEXT: v_or_b32_e32 v17, v62, v1
; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v14, v4, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45
-; SI-NEXT: v_or_b32_e32 v15, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_or_b32_e32 v3, v8, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40
+; SI-NEXT: v_or_b32_e32 v15, v6, v1
; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v17, v14, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v17, v14, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v17, v14, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16
+; SI-NEXT: v_alignbit_b32 v1, v15, v3, 24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8
+; SI-NEXT: v_alignbit_b32 v1, v15, v3, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v15, v3, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v33, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v16, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_bfe_u32 v1, v46, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v32, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v31, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_bfe_u32 v1, v11, 8, 8
+; SI-NEXT: v_mov_b32_e32 v11, v12
+; SI-NEXT: v_mov_b32_e32 v12, v13
+; SI-NEXT: v_mov_b32_e32 v13, v31
+; SI-NEXT: v_mov_b32_e32 v31, v32
+; SI-NEXT: v_mov_b32_e32 v32, v33
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v12, 8, 8
+; SI-NEXT: v_bfe_u32 v1, v32, 8, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v9, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v13, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v12, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v11, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v10, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v55, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v8, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v45, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v7, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v43, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v6, 8, 8
+; SI-NEXT: v_bfe_u32 v1, v10, 8, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v56, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v9, 8, 8
+; SI-NEXT: v_mov_b32_e32 v55, v40
+; SI-NEXT: v_mov_b32_e32 v40, v42
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v47, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v2, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v40, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v46, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v45, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -191439,202 +191482,209 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: v_mov_b32_e32 v33, v46
+; SI-NEXT: v_mov_b32_e32 v46, v16
+; SI-NEXT: v_mov_b32_e32 v16, v3
+; SI-NEXT: v_bfe_u32 v42, v55, 8, 8
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: .LBB94_2: ; %Flow
-; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
+; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB94_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v16, v8, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v55
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55
+; SI-NEXT: v_or_b32_e32 v15, v6, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v63
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v14, v4, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v45
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_or_b32_e32 v14, v6, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v62
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v47
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v45, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45
-; SI-NEXT: v_or_b32_e32 v15, v2, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v43
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v16, v1, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v42
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v46, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46
-; SI-NEXT: v_or_b32_e32 v17, v1, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v40
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v18, v2, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v55
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_or_b32_e32 v20, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_or_b32_e32 v17, v5, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v61
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v60
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v18, v4, v5
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v1
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
-; SI-NEXT: v_or_b32_e32 v19, v1, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v61
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v20, v2, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v60
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42
; SI-NEXT: v_or_b32_e32 v21, v1, v2
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v4
; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
; SI-NEXT: v_cvt_f32_f16_e32 v2, v58
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; SI-NEXT: v_or_b32_e32 v19, v3, v4
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v47
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v22, v2, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v2
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; SI-NEXT: v_or_b32_e32 v23, v1, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v6
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_or_b32_e32 v24, v3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v25, v2, v3
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v7
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v45
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v2
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; SI-NEXT: v_or_b32_e32 v25, v2, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v43
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v26, v2, v1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v9
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_or_b32_e32 v28, v4, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_or_b32_e32 v27, v3, v1
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v8
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_or_b32_e32 v27, v3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v29, v1, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v11
; SI-NEXT: v_or_b32_e32 v30, v3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v34, v2, v3
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v11
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v12
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v11, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -191647,24 +191697,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v35, v2, v1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v37, v4, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_or_b32_e32 v36, v3, v1
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v13
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v32
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -191674,34 +191724,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32
; SI-NEXT: v_or_b32_e32 v38, v1, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v9
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_or_b32_e32 v48, v3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v39, v2, v3
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v12
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v39, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v13
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
@@ -191712,21 +191764,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v49, v2, v1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v13
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v51, v4, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_or_b32_e32 v50, v3, v1
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v1, v31
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v31, v1
@@ -191742,30 +191794,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31
; SI-NEXT: v_or_b32_e32 v52, v1, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v33
; SI-NEXT: v_or_b32_e32 v54, v3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v53, v2, v3
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v46
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v46, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
@@ -191776,274 +191828,279 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v44, v2, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46
; SI-NEXT: v_or_b32_e32 v41, v3, v1
; SI-NEXT: v_alignbit_b32 v1, v41, v44, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v41, v44, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v41, v44, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v52, v51, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v52, v51, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v17, v14, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v17, v14, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v17, v14, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16
+; SI-NEXT: v_alignbit_b32 v1, v15, v16, 24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8
+; SI-NEXT: v_alignbit_b32 v1, v15, v16, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v15, v16, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v33, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v46, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v32, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v33, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v31, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_bfe_u32 v1, v13, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_bfe_u32 v1, v10, 8, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v12, 8, 8
+; SI-NEXT: v_bfe_u32 v1, v32, 8, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v9, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v13, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v12, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v11, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v10, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_bfe_u32 v1, v9, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_bfe_u32 v1, v8, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_bfe_u32 v1, v7, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_bfe_u32 v1, v6, 8, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_bfe_u32 v1, v42, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_bfe_u32 v1, v55, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_bfe_u32 v1, v40, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v46, 8, 8
-; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v5, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v45, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v1, v40, 8, 8
+; SI-NEXT: v_bfe_u32 v42, v55, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: .LBB94_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v44
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v41
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v46
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192052,14 +192109,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v54
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192070,14 +192127,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v53
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v32
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192086,14 +192143,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v51
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192104,8 +192161,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v52
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -192120,14 +192177,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v49
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192138,14 +192195,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v50
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v12
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v13
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192154,14 +192211,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v48
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192172,30 +192229,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v39
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v37
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192206,14 +192265,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v38
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v13
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v32
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192222,14 +192281,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v35
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192240,14 +192299,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v36
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v11
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v12
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192256,14 +192315,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v30
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192274,14 +192333,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v34
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v10
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192290,14 +192349,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v28
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192308,30 +192367,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v29
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v26
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192342,33 +192403,35 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v27
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v24
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -192376,30 +192439,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v25
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v22
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -192410,14 +192475,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v23
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -192428,14 +192493,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v20
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -192446,14 +192511,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v21
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -192464,14 +192529,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v18
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -192482,14 +192547,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v19
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -192500,14 +192565,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -192518,14 +192583,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v17
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v46
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v40
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
@@ -192534,14 +192599,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -192552,18 +192617,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v45
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v55
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -192607,54 +192670,56 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v10
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v28
; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v26
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v24
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v22
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v21
+; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v20
+; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v62
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v61
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
@@ -192707,6 +192772,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
@@ -192750,31 +192816,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: s_waitcnt vmcnt(12)
-; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v61
-; VI-NEXT: s_waitcnt vmcnt(11)
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v60
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr32
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr32
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr32
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr52
; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr32
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -192789,770 +192854,773 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: s_cbranch_execz .LBB94_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v4
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v3
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v2
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v2
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v1
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v61
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v62
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v62
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v61
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v60
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v30
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v30
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v28
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v27
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v26
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v26
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v24
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v24
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v23
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v22
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v21
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v20
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v20
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v19
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v18
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v18
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v17
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16]
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v31, v33
-; VI-NEXT: v_mov_b32_e32 v33, v43
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v31, v43
; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[5:6]
-; VI-NEXT: v_mov_b32_e32 v43, v33
-; VI-NEXT: v_mov_b32_e32 v33, v46
-; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[3:4]
-; VI-NEXT: v_mov_b32_e32 v46, v33
-; VI-NEXT: v_mov_b32_e32 v33, v53
-; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v53, v33
-; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[60:61]
+; VI-NEXT: v_mov_b32_e32 v32, v57
+; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[3:4]
+; VI-NEXT: v_mov_b32_e32 v43, v31
+; VI-NEXT: v_mov_b32_e32 v57, v32
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[61:62]
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[29:30]
-; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28]
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, v36
+; VI-NEXT: v_lshrrev_b64 v[59:60], 24, v[27:28]
+; VI-NEXT: v_mov_b32_e32 v32, v35
; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26]
-; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
-; VI-NEXT: v_mov_b32_e32 v36, v33
-; VI-NEXT: v_mov_b32_e32 v33, v41
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
-; VI-NEXT: v_mov_b32_e32 v34, v51
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v35, v32
+; VI-NEXT: v_mov_b32_e32 v32, v41
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
+; VI-NEXT: v_mov_b32_e32 v41, v32
+; VI-NEXT: v_mov_b32_e32 v32, v53
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v18
+; VI-NEXT: v_mov_b32_e32 v53, v32
+; VI-NEXT: v_mov_b32_e32 v32, v34
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; VI-NEXT: v_mov_b32_e32 v59, v50
+; VI-NEXT: v_mov_b32_e32 v34, v32
+; VI-NEXT: v_mov_b32_e32 v32, v51
; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18]
-; VI-NEXT: v_mov_b32_e32 v41, v33
-; VI-NEXT: v_mov_b32_e32 v33, v31
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[19:20]
-; VI-NEXT: v_mov_b32_e32 v51, v34
+; VI-NEXT: v_mov_b32_e32 v51, v32
; VI-NEXT: .LBB94_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB94_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v63, 0x200
-; VI-NEXT: v_add_f16_sdwa v31, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v55, 0x200
+; VI-NEXT: v_add_f16_sdwa v31, v18, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v18, 0x200, v18
; VI-NEXT: v_or_b32_e32 v32, v18, v31
-; VI-NEXT: v_add_f16_sdwa v31, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v17, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v17, 0x200, v17
; VI-NEXT: v_or_b32_e32 v31, v17, v31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_add_f16_sdwa v31, v20, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v20, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v20, 0x200, v20
; VI-NEXT: v_or_b32_e32 v32, v20, v31
-; VI-NEXT: v_add_f16_sdwa v31, v19, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v19, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v19, 0x200, v19
; VI-NEXT: v_or_b32_e32 v31, v19, v31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_add_f16_sdwa v34, v22, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v34
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v22, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v22, 0x200, v22
; VI-NEXT: v_or_b32_e32 v32, v22, v31
-; VI-NEXT: v_add_f16_sdwa v31, v21, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v21, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v21, 0x200, v21
; VI-NEXT: v_or_b32_e32 v31, v21, v31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_add_f16_sdwa v31, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v24, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v24, 0x200, v24
; VI-NEXT: v_or_b32_e32 v32, v24, v31
-; VI-NEXT: v_add_f16_sdwa v31, v23, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v23, 0x200, v23
; VI-NEXT: v_or_b32_e32 v31, v23, v31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_add_f16_sdwa v31, v26, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v26, 0x200, v26
-; VI-NEXT: v_or_b32_e32 v36, v26, v31
-; VI-NEXT: v_add_f16_sdwa v31, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v32, v26, v31
+; VI-NEXT: v_add_f16_sdwa v31, v25, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v25, 0x200, v25
-; VI-NEXT: v_or_b32_e32 v35, v25, v31
-; VI-NEXT: v_add_f16_sdwa v31, v28, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v31, v25, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v28, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v28, 0x200, v28
; VI-NEXT: v_or_b32_e32 v38, v28, v31
-; VI-NEXT: v_add_f16_sdwa v31, v27, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v27, 0x200, v27
; VI-NEXT: v_or_b32_e32 v37, v27, v31
-; VI-NEXT: v_add_f16_sdwa v31, v30, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v31, v30, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v30, 0x200, v30
-; VI-NEXT: v_add_f16_sdwa v32, v29, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v32, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v29, 0x200, v29
-; VI-NEXT: v_or_b32_e32 v49, v30, v31
+; VI-NEXT: v_or_b32_e32 v34, v30, v31
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT: v_or_b32_e32 v48, v29, v31
-; VI-NEXT: v_add_f16_sdwa v31, v61, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v33, v29, v31
+; VI-NEXT: v_add_f16_sdwa v31, v62, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; VI-NEXT: v_add_f16_e32 v61, 0x200, v61
-; VI-NEXT: v_add_f16_sdwa v32, v60, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v62, 0x200, v62
+; VI-NEXT: v_add_f16_sdwa v32, v61, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_add_f16_e32 v60, 0x200, v60
-; VI-NEXT: v_or_b32_e32 v51, v61, v31
+; VI-NEXT: v_add_f16_e32 v61, 0x200, v61
+; VI-NEXT: v_or_b32_e32 v51, v62, v31
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT: v_or_b32_e32 v50, v60, v31
-; VI-NEXT: v_add_f16_sdwa v31, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v47, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v50, v61, v31
; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
-; VI-NEXT: v_add_f16_sdwa v32, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_sdwa v32, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v47
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
; VI-NEXT: v_or_b32_e32 v53, v2, v31
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
; VI-NEXT: v_or_b32_e32 v52, v1, v31
-; VI-NEXT: v_add_f16_sdwa v31, v4, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v4, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
-; VI-NEXT: v_add_f16_sdwa v32, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v32, v3, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
; VI-NEXT: v_or_b32_e32 v46, v4, v31
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_add_f16_sdwa v36, v6, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v45, v3, v31
-; VI-NEXT: v_add_f16_sdwa v31, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
-; VI-NEXT: v_add_f16_sdwa v32, v5, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_sdwa v32, v5, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v36
; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
; VI-NEXT: v_or_b32_e32 v43, v6, v31
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT: v_add_f16_sdwa v44, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v44, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v42, v5, v31
; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
-; VI-NEXT: v_add_f16_sdwa v32, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v32, v7, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v44
; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
; VI-NEXT: v_or_b32_e32 v41, v8, v31
; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_add_f16_sdwa v49, v10, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v40, v7, v31
-; VI-NEXT: v_add_f16_sdwa v31, v10, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; VI-NEXT: v_add_f16_e32 v10, 0x200, v10
-; VI-NEXT: v_add_f16_sdwa v32, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
-; VI-NEXT: v_or_b32_e32 v55, v10, v31
-; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT: v_add_f16_sdwa v39, v12, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v54, v9, v31
+; VI-NEXT: v_add_f16_sdwa v35, v9, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v49
+; VI-NEXT: v_add_f16_sdwa v39, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v32, v10, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v35
; VI-NEXT: v_add_f16_e32 v12, 0x200, v12
-; VI-NEXT: v_add_f16_sdwa v33, v11, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v39
-; VI-NEXT: v_add_f16_sdwa v47, v14, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v32, v12, v31
-; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v33
+; VI-NEXT: v_add_f16_sdwa v35, v11, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v39
+; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
+; VI-NEXT: v_or_b32_e32 v57, v12, v54
+; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v35
+; VI-NEXT: v_add_f16_sdwa v63, v14, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v56, v11, v54
; VI-NEXT: v_add_f16_e32 v14, 0x200, v14
-; VI-NEXT: v_add_f16_sdwa v33, v13, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v47
-; VI-NEXT: v_or_b32_e32 v57, v14, v56
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v33
-; VI-NEXT: v_add_f16_sdwa v33, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v48, v13, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v63
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
+; VI-NEXT: v_or_b32_e32 v59, v14, v54
+; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v48
+; VI-NEXT: v_add_f16_sdwa v48, v16, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v58, v13, v54
; VI-NEXT: v_add_f16_e32 v16, 0x200, v16
-; VI-NEXT: v_add_f16_sdwa v63, v15, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v33
+; VI-NEXT: v_add_f16_sdwa v60, v15, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v48
; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
-; VI-NEXT: v_or_b32_e32 v59, v16, v58
-; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v63
-; VI-NEXT: v_or_b32_e32 v58, v15, v58
-; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v59
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v58
+; VI-NEXT: v_or_b32_e32 v55, v16, v54
+; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v60
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v54, v15, v54
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v55
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v54
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[54:55]
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v59
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v58
; VI-NEXT: v_lshrrev_b64 v[58:59], 24, v[58:59]
-; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v56, v13, v56
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v57
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v56
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v57
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v56
; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[56:57]
-; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v31, v11, v31
+; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v31, v9, v31
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v32
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v31
; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[31:32]
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v55
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v54
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[54:55]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v41
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v40
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[40:41]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v43
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v42
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v46
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v45
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v53
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v52
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[52:53]
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v51
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v50
+; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51]
; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[42:43]
-; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[45:46]
-; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53]
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[48:49]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v38
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v37
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[37:38]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v36
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v35
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36]
-; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v54, v39
-; VI-NEXT: v_mov_b32_e32 v37, v44
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_mov_b32_e32 v56, v58
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[45:46]
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[33:34]
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v38
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v37
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[37:38]
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v57, v35
+; VI-NEXT: v_mov_b32_e32 v45, v44
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[33:34]
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[48:49]
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v48, v33
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[33:34]
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v41, v39
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[50:51]
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[33:34]
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51]
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v34
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v33
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34]
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v51
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v50
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v33, 8, 8
-; VI-NEXT: v_mov_b32_e32 v33, v47
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v33, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v39, 8, 8
-; VI-NEXT: v_mov_b32_e32 v39, v63
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v48, 8, 8
+; VI-NEXT: v_mov_b32_e32 v48, v63
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v48, 8, 8
; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51]
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_bfe_u32 v32, v63, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v41, 8, 8
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v49, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v32, v44, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_bfe_u32 v32, v47, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v44, v32
-; VI-NEXT: v_bfe_u32 v32, v32, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v36, 8, 8
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v55, v32
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v49, v47
+; VI-NEXT: v_mov_b32_e32 v44, v36
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_mov_b32_e32 v37, v32
; VI-NEXT: v_bfe_u32 v32, v32, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v36, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v47, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v51, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_bfe_u32 v32, v32, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfe_u32 v32, v32, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v58, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v57, 8, 8
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v59, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v58, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v55, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v32, v34, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v53, 8, 8
-; VI-NEXT: v_mov_b32_e32 v58, v57
-; VI-NEXT: v_mov_b32_e32 v57, v59
-; VI-NEXT: v_mov_b32_e32 v59, v34
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v32, v41, 8, 8
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v46, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v54, v32
+; VI-NEXT: v_bfe_u32 v59, v32, 8, 8
; VI-NEXT: .LBB94_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v31
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32
+; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v52
+; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32
+; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v34, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50
-; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31
-; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33
+; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52
+; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -193562,10 +193630,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -193576,21 +193644,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -193601,8 +193671,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -193614,27 +193684,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -195917,7 +195987,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32
@@ -195939,459 +196013,489 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v28
+; SI-NEXT: v_mov_b32_e32 v28, v27
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v29, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v11
-; SI-NEXT: v_mov_b32_e32 v59, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v2
; SI-NEXT: v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v42, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v5
; SI-NEXT: v_cvt_f16_f32_e32 v61, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v14
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v59
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v20
-; SI-NEXT: v_cvt_f16_f32_e32 v59, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v18, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v59, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v62, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v23
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v26, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s17
; SI-NEXT: v_cvt_f16_f32_e32 v13, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v20, s19
; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
; SI-NEXT: v_cvt_f16_f32_e32 v16, s20
-; SI-NEXT: v_cvt_f16_f32_e32 v20, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v18, s23
; SI-NEXT: v_cvt_f16_f32_e32 v14, s22
; SI-NEXT: v_cvt_f16_f32_e32 v22, s25
; SI-NEXT: v_cvt_f16_f32_e32 v21, s24
-; SI-NEXT: v_cvt_f16_f32_e32 v15, s27
; SI-NEXT: v_cvt_f16_f32_e32 v19, s26
; SI-NEXT: v_cvt_f16_f32_e32 v24, s29
; SI-NEXT: v_cvt_f16_f32_e32 v23, s28
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34
; SI-NEXT: v_cvt_f16_f32_e32 v34, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v38
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v11, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v17, s21
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v49
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
; SI-NEXT: v_cvt_f16_f32_e32 v49, v51
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v52
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v11, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v53
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v54
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v40
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v55
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v44
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v40
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v47
-; SI-NEXT: v_cvt_f16_f32_e32 v44, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v17, s21
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f16_f32_e32 v28, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v41
-; SI-NEXT: v_cvt_f16_f32_e32 v41, v45
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v45
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v45, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v46, v57
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v57
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v28, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v48, s27
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v41, v47
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB95_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_or_b32_e32 v13, v13, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v44
+; SI-NEXT: v_or_b32_e32 v47, v13, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20
; SI-NEXT: v_or_b32_e32 v55, v12, v11
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17
-; SI-NEXT: v_or_b32_e32 v57, v16, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20
-; SI-NEXT: v_or_b32_e32 v17, v14, v11
+; SI-NEXT: v_or_b32_e32 v13, v16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; SI-NEXT: v_or_b32_e32 v46, v14, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22
-; SI-NEXT: v_or_b32_e32 v21, v21, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15
+; SI-NEXT: v_or_b32_e32 v17, v21, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48
; SI-NEXT: v_or_b32_e32 v16, v19, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_or_b32_e32 v19, v23, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29
-; SI-NEXT: v_or_b32_e32 v47, v60, v11
+; SI-NEXT: v_mov_b32_e32 v24, v32
+; SI-NEXT: v_or_b32_e32 v22, v23, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24
+; SI-NEXT: v_or_b32_e32 v14, v60, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_or_b32_e32 v43, v42, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33
-; SI-NEXT: v_or_b32_e32 v14, v63, v11
+; SI-NEXT: v_mov_b32_e32 v21, v6
+; SI-NEXT: v_or_b32_e32 v23, v42, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21
+; SI-NEXT: v_or_b32_e32 v60, v63, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61
-; SI-NEXT: v_or_b32_e32 v42, v58, v11
+; SI-NEXT: v_or_b32_e32 v43, v3, v11
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v32, v31
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_mov_b32_e32 v31, v10
+; SI-NEXT: v_mov_b32_e32 v10, v9
+; SI-NEXT: v_mov_b32_e32 v9, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v60, v12, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3
-; SI-NEXT: v_or_b32_e32 v22, v2, v11
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v6, v3, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7
+; SI-NEXT: v_or_b32_e32 v12, v59, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52
-; SI-NEXT: v_or_b32_e32 v12, v46, v12
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v24, v2, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; SI-NEXT: v_or_b32_e32 v4, v4, v11
+; SI-NEXT: v_or_b32_e32 v63, v5, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v58
+; SI-NEXT: v_or_b32_e32 v42, v15, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; SI-NEXT: v_or_b32_e32 v34, v34, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; SI-NEXT: v_or_b32_e32 v3, v59, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29
+; SI-NEXT: v_or_b32_e32 v5, v56, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9
-; SI-NEXT: v_or_b32_e32 v59, v56, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18
-; SI-NEXT: v_or_b32_e32 v6, v62, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7
-; SI-NEXT: v_or_b32_e32 v62, v25, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26
-; SI-NEXT: v_or_b32_e32 v2, v27, v11
+; SI-NEXT: v_or_b32_e32 v56, v62, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27
+; SI-NEXT: v_or_b32_e32 v3, v30, v11
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v8
-; SI-NEXT: v_or_b32_e32 v25, v28, v11
+; SI-NEXT: v_or_b32_e32 v58, v25, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33
+; SI-NEXT: v_or_b32_e32 v26, v26, v11
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30
-; SI-NEXT: v_or_b32_e32 v1, v36, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; SI-NEXT: v_or_b32_e32 v23, v35, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v18, v39, v11
-; SI-NEXT: v_mov_b32_e32 v36, v2
-; SI-NEXT: v_mov_b32_e32 v35, v1
-; SI-NEXT: v_alignbit_b32 v1, v55, v13, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v25, v28, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v35
+; SI-NEXT: v_or_b32_e32 v29, v36, v11
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v62
+; SI-NEXT: v_or_b32_e32 v27, v38, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v39
+; SI-NEXT: v_or_b32_e32 v19, v37, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; SI-NEXT: v_or_b32_e32 v7, v50, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v49
+; SI-NEXT: v_mov_b32_e32 v36, v5
+; SI-NEXT: v_or_b32_e32 v5, v52, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; SI-NEXT: v_or_b32_e32 v39, v51, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53
+; SI-NEXT: v_mov_b32_e32 v38, v3
+; SI-NEXT: v_or_b32_e32 v3, v54, v11
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_mov_b32_e32 v54, v7
+; SI-NEXT: v_alignbit_b32 v7, v55, v47, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v55, v13, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v55, v47, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v55, v13, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v55, v47, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v57, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v46, v13, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v57, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v46, v13, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v57, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v46, v13, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v16, v21, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v16, v17, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v16, v21, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v16, v17, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v16, v21, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v16, v17, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v47, v19, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v14, v22, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v47, v19, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v14, v22, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v47, v19, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v14, v22, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v14, v43, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v60, v23, 24
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v14, v43, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v60, v23, 16
+; SI-NEXT: v_or_b32_e32 v50, v44, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40
+; SI-NEXT: v_mov_b32_e32 v40, v43
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v14, v43, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v60, v23, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v60, v42, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v6, v40, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v60, v42, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v6, v40, 16
+; SI-NEXT: v_mov_b32_e32 v53, v12
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v60, v42, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v6, v40, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v24, v22, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v63, v53, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v24, v22, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v63, v53, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v24, v22, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v34, v42, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v34, v4, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v34, v42, 16
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v7, v34, v42, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v34, v4, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v56, v36, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v34, v4, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v56, v36, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v59, v3, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v56, v36, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v59, v3, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v58, v38, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v59, v3, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v34
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38
-; SI-NEXT: v_or_b32_e32 v61, v50, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v49
-; SI-NEXT: v_or_b32_e32 v2, v48, v11
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v49, v6
+; SI-NEXT: v_alignbit_b32 v7, v58, v38, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v62, v49, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v58, v38, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v62, v49, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v25, v26, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v62, v49, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v25, v26, 16
+; SI-NEXT: v_or_b32_e32 v15, v45, v11
+; SI-NEXT: v_mov_b32_e32 v45, v27
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v25, v36, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v25, v26, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v25, v36, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v45, v29, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v25, v36, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v45, v29, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v23, v35, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v45, v29, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v23, v35, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v54, v19, 24
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v41
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v23, v35, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v54, v19, 16
+; SI-NEXT: v_alignbit_b32 v11, v54, v19, 8
+; SI-NEXT: v_or_b32_e32 v12, v57, v12
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v61, v18, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v7, v19
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v29, v39, v5, 24
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v61, v18, 16
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48
-; SI-NEXT: v_or_b32_e32 v58, v54, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51
-; SI-NEXT: v_or_b32_e32 v6, v53, v11
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v1, v61, v18, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v11, v39, v5, 16
+; SI-NEXT: v_mov_b32_e32 v19, v5
+; SI-NEXT: v_alignbit_b32 v5, v39, v5, 8
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v42, v50, v3, 24
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v58, v2, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v1, v58, v2, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v5, v50, v3, 16
+; SI-NEXT: v_mov_b32_e32 v59, v3
+; SI-NEXT: v_alignbit_b32 v57, v50, v3, 8
+; SI-NEXT: v_alignbit_b32 v3, v12, v15, 24
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v55
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v46
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v16
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v14
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v60
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v6
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v63
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v59
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v34
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v56
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v58
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v37
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v25
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v61
-; SI-NEXT: v_or_b32_e32 v54, v40, v11
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v45
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v54
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v39
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v50
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v44, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v12
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v20, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v20, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v15, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v18, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v29, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v48, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v33, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v24, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v32, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v21, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v31, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v32, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v10, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v31, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v9, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v10, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v8, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v9, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v5, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v8, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v38, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v30, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v48, 8, 8
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v3, v62, 8, 8
+; SI-NEXT: v_mov_b32_e32 v44, v1
+; SI-NEXT: v_bfe_u32 v1, v1, 8, 8
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v37, 8, 8
-; SI-NEXT: v_or_b32_e32 v11, v45, v11
+; SI-NEXT: v_bfe_u32 v3, v4, 8, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v52, 8, 8
-; SI-NEXT: v_alignbit_b32 v28, v58, v2, 24
-; SI-NEXT: v_alignbit_b32 v2, v54, v6, 24
-; SI-NEXT: v_alignbit_b32 v39, v54, v6, 16
-; SI-NEXT: v_alignbit_b32 v40, v54, v6, 8
-; SI-NEXT: v_alignbit_b32 v27, v12, v11, 24
-; SI-NEXT: v_alignbit_b32 v56, v12, v11, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v30, v12, v11, 8
-; SI-NEXT: v_mov_b32_e32 v20, v29
-; SI-NEXT: v_mov_b32_e32 v15, v33
-; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_bfe_u32 v1, v41, 8, 8
+; SI-NEXT: v_alignbit_b32 v27, v63, v53, 16
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v26, v12, v15, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v15
+; SI-NEXT: v_alignbit_b32 v43, v12, v15, 8
+; SI-NEXT: v_mov_b32_e32 v52, v20
+; SI-NEXT: v_mov_b32_e32 v20, v18
+; SI-NEXT: v_mov_b32_e32 v18, v48
+; SI-NEXT: v_mov_b32_e32 v15, v24
+; SI-NEXT: v_mov_b32_e32 v28, v21
+; SI-NEXT: v_mov_b32_e32 v21, v23
+; SI-NEXT: v_mov_b32_e32 v23, v22
+; SI-NEXT: v_mov_b32_e32 v22, v17
+; SI-NEXT: v_mov_b32_e32 v17, v13
+; SI-NEXT: v_mov_b32_e32 v13, v47
; SI-NEXT: v_mov_b32_e32 v32, v31
; SI-NEXT: v_mov_b32_e32 v31, v10
; SI-NEXT: v_mov_b32_e32 v10, v9
-; SI-NEXT: v_mov_b32_e32 v9, v7
-; SI-NEXT: v_bfe_u32 v29, v7, 8, 8
-; SI-NEXT: v_mov_b32_e32 v7, v8
-; SI-NEXT: v_mov_b32_e32 v8, v5
-; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v48, v30
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v2
+; SI-NEXT: v_bfe_u32 v30, v2, 8, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v37, v62
; SI-NEXT: s_branch .LBB95_3
; SI-NEXT: .LBB95_2:
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v20
+; SI-NEXT: v_mov_b32_e32 v20, v18
+; SI-NEXT: v_mov_b32_e32 v18, v48
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_mov_b32_e32 v44, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -196537,391 +196641,390 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_mov_b32_e32 v20, v29
-; SI-NEXT: v_mov_b32_e32 v15, v33
-; SI-NEXT: v_mov_b32_e32 v33, v32
-; SI-NEXT: v_mov_b32_e32 v32, v31
-; SI-NEXT: v_mov_b32_e32 v31, v10
-; SI-NEXT: v_mov_b32_e32 v10, v9
-; SI-NEXT: v_mov_b32_e32 v9, v7
-; SI-NEXT: v_mov_b32_e32 v7, v8
-; SI-NEXT: v_mov_b32_e32 v8, v5
-; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v15, v32
+; SI-NEXT: v_mov_b32_e32 v28, v6
+; SI-NEXT: v_mov_b32_e32 v33, v31
+; SI-NEXT: v_mov_b32_e32 v32, v10
+; SI-NEXT: v_mov_b32_e32 v31, v9
+; SI-NEXT: v_mov_b32_e32 v10, v8
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: v_mov_b32_e32 v3, v2
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: .LBB95_3: ; %Flow
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, v44
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v5, v8
-; SI-NEXT: v_mov_b32_e32 v6, v7
-; SI-NEXT: v_mov_b32_e32 v7, v9
-; SI-NEXT: v_mov_b32_e32 v8, v10
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_mov_b32_e32 v47, v48
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v2, v3
; SI-NEXT: v_mov_b32_e32 v9, v31
; SI-NEXT: v_mov_b32_e32 v31, v33
-; SI-NEXT: v_mov_b32_e32 v44, v15
-; SI-NEXT: v_mov_b32_e32 v33, v20
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v50, v2
-; SI-NEXT: v_mov_b32_e32 v53, v40
-; SI-NEXT: v_mov_b32_e32 v40, v28
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mov_b32_e32 v2, v48
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v15
+; SI-NEXT: v_mov_b32_e32 v15, v18
+; SI-NEXT: v_mov_b32_e32 v18, v20
+; SI-NEXT: v_mov_b32_e32 v20, v52
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v3, v41
+; SI-NEXT: v_mov_b32_e32 v35, v30
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT: v_mov_b32_e32 v11, v27
-; SI-NEXT: v_mov_b32_e32 v38, v30
-; SI-NEXT: v_mov_b32_e32 v27, v52
-; SI-NEXT: v_mov_b32_e32 v30, v29
-; SI-NEXT: v_mov_b32_e32 v29, v26
+; SI-NEXT: v_mov_b32_e32 v4, v8
+; SI-NEXT: v_mov_b32_e32 v8, v10
+; SI-NEXT: v_mov_b32_e32 v10, v32
+; SI-NEXT: v_mov_b32_e32 v32, v28
+; SI-NEXT: v_mov_b32_e32 v28, v43
+; SI-NEXT: v_mov_b32_e32 v43, v42
+; SI-NEXT: v_mov_b32_e32 v42, v5
+; SI-NEXT: v_mov_b32_e32 v24, v29
+; SI-NEXT: v_mov_b32_e32 v29, v1
+; SI-NEXT: v_mov_b32_e32 v1, v19
+; SI-NEXT: v_mov_b32_e32 v19, v7
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_mov_b32_e32 v5, v37
; SI-NEXT: s_cbranch_vccnz .LBB95_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v29
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2
-; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v47
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v6
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v49
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v41
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v44
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
+; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
+; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34
-; SI-NEXT: v_or_b32_e32 v56, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v4
+; SI-NEXT: v_or_b32_e32 v11, v12, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT: v_or_b32_e32 v36, v14, v13
+; SI-NEXT: v_or_b32_e32 v59, v14, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT: v_or_b32_e32 v54, v14, v16
+; SI-NEXT: v_or_b32_e32 v55, v14, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_or_b32_e32 v52, v17, v16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v17, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_or_b32_e32 v58, v17, v19
-; SI-NEXT: v_alignbit_b32 v40, v58, v52, 24
+; SI-NEXT: v_or_b32_e32 v17, v17, v19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_or_b32_e32 v11, v21, v19
+; SI-NEXT: v_or_b32_e32 v19, v21, v19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v61, v21, v22
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v54, v21, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: v_or_b32_e32 v16, v23, v22
+; SI-NEXT: v_or_b32_e32 v36, v23, v22
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: v_or_b32_e32 v23, v23, v25
+; SI-NEXT: v_or_b32_e32 v45, v23, v25
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT: v_or_b32_e32 v48, v25, v24
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; SI-NEXT: v_or_b32_e32 v13, v25, v24
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_or_b32_e32 v25, v26, v25
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v62
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT: v_or_b32_e32 v53, v26, v27
+; SI-NEXT: v_or_b32_e32 v14, v26, v27
; SI-NEXT: v_mov_b32_e32 v26, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v16
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT: v_or_b32_e32 v62, v28, v27
+; SI-NEXT: v_or_b32_e32 v58, v28, v27
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v29, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v27, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v29, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT: v_or_b32_e32 v59, v29, v34
+; SI-NEXT: v_or_b32_e32 v56, v29, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v29, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v51
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v29, v16
+; SI-NEXT: v_mov_b32_e32 v62, v29
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v7
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT: v_or_b32_e32 v3, v30, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v44
+; SI-NEXT: v_or_b32_e32 v7, v30, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v52
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v13
+; SI-NEXT: v_mov_b32_e32 v28, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v48
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT: v_or_b32_e32 v4, v34, v30
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v32
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT: v_mov_b32_e32 v30, v10
-; SI-NEXT: v_mov_b32_e32 v32, v30
+; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v41, v34, v30
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9
; SI-NEXT: v_or_b32_e32 v34, v35, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v46
-; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v61
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_or_b32_e32 v22, v35, v36
+; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; SI-NEXT: v_or_b32_e32 v53, v35, v36
; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10
-; SI-NEXT: v_mov_b32_e32 v35, v16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v63, v37, v36
+; SI-NEXT: v_mov_b32_e32 v36, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38
+; SI-NEXT: v_or_b32_e32 v40, v39, v37
+; SI-NEXT: v_mov_b32_e32 v38, v14
+; SI-NEXT: v_alignbit_b32 v27, v63, v53, 16
+; SI-NEXT: v_alignbit_b32 v30, v63, v53, 8
+; SI-NEXT: v_bfe_u32 v35, v2, 8, 8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: v_or_b32_e32 v24, v37, v36
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38
-; SI-NEXT: v_or_b32_e32 v42, v39, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v45
-; SI-NEXT: v_mov_b32_e32 v36, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v7
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v48, v39
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v13
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: v_or_b32_e32 v60, v37, v39
+; SI-NEXT: v_or_b32_e32 v6, v37, v39
+; SI-NEXT: v_mov_b32_e32 v39, v17
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48
; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT: v_alignbit_b32 v39, v54, v29, 16
-; SI-NEXT: v_or_b32_e32 v43, v48, v37
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT: v_or_b32_e32 v14, v49, v48
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v24, v39, v1, 24
+; SI-NEXT: v_or_b32_e32 v21, v48, v37
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v13
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v28, v14, v43, 8
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
+; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
+; SI-NEXT: v_or_b32_e32 v60, v49, v48
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -196930,20 +197033,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT: v_or_b32_e32 v19, v48, v37
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v23, v48, v37
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33
-; SI-NEXT: v_or_b32_e32 v47, v49, v37
+; SI-NEXT: v_or_b32_e32 v14, v49, v37
; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48
-; SI-NEXT: v_or_b32_e32 v21, v50, v37
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v22, v50, v37
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -196953,26 +197056,27 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_or_b32_e32 v16, v37, v49
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48
; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v50
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT: v_alignbit_b32 v50, v54, v29, 24
-; SI-NEXT: v_or_b32_e32 v57, v48, v37
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v50, v55
+; SI-NEXT: v_alignbit_b32 v43, v50, v59, 24
+; SI-NEXT: v_alignbit_b32 v42, v50, v59, 16
+; SI-NEXT: v_or_b32_e32 v17, v48, v37
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v18
+; SI-NEXT: v_alignbit_b32 v57, v50, v59, 8
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT: v_or_b32_e32 v17, v49, v48
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v46, v49, v48
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT: v_mov_b32_e32 v49, v53
-; SI-NEXT: v_alignbit_b32 v53, v54, v29, 8
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
@@ -196983,573 +197087,575 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v13, v48, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20
; SI-NEXT: v_or_b32_e32 v55, v51, v37
-; SI-NEXT: v_alignbit_b32 v10, v55, v13, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v55, v13, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v55, v13, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v55, v13, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v17, v57, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v55, v13, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v17, v57, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v46, v17, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v17, v57, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v46, v17, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v16, v21, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v46, v17, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v16, v21, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v16, v22, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v16, v21, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v16, v22, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v47, v19, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v16, v22, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v47, v19, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v14, v23, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v47, v19, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v14, v23, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v14, v43, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v14, v23, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v14, v43, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v60, v21, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v60, v42, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v60, v21, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v60, v42, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v60, v21, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v60, v42, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v6, v40, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v24, v22, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v6, v40, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v24, v22, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v6, v40, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v24, v22, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v63, v53, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v34, v4, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v34, v41, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v34, v4, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v34, v41, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v34, v4, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v34, v41, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v59, v3, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v56, v36, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v59, v3, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v56, v36, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v59, v3, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v56, v36, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v62, v49, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v58, v38, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v62, v49, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v58, v38, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v62, v49, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v58, v38, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v25, v36, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v25, v26, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v25, v36, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v25, v26, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v7, v25, v26, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v25, v36, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v45, v28, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v23, v35, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v45, v28, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v7, v45, v28, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v23, v35, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v54, v19, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v23, v35, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v54, v19, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v61, v11, 24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v54, v19, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v61, v11, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v10, v61, v11, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v39, v1, 16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v10, v58, v52, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v10, v58, v52, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v7, v39, v1, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v10, v56
-; SI-NEXT: v_alignbit_b32 v11, v12, v10, 24
-; SI-NEXT: v_alignbit_b32 v56, v12, v10, 16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v38, v12, v10, 8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v55
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v55
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v46
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v17
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v16
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v47
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v60
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v14
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v60
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v63
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v24
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v34
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v59
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v56
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v62
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v58
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v25
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v23
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v45
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v54
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v58
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v39
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v54
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v50
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v12
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v12
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v20, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v20, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v18, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v18, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v15, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v15, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v33, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v33, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v44, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v32, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v31, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v31, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v30, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v10, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v9, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v9, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v8, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v8, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v6, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v29, 8, 8
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v5, 8, 8
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_bfe_u32 v10, v26, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v47, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v2, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v5, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v1, 8, 8
-; SI-NEXT: v_alignbit_b32 v48, v55, v13, 24
-; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34
-; SI-NEXT: v_bfe_u32 v30, v7, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v7, v4, 8, 8
+; SI-NEXT: v_alignbit_b32 v26, v12, v11, 24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v10, v27, 8, 8
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16
+; SI-NEXT: v_alignbit_b32 v28, v12, v11, 8
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_bfe_u32 v7, v29, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_bfe_u32 v7, v3, 8, 8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: .LBB95_5: ; %end
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v37, 0xff, v13
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v48
; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v37, v37, v51
; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v51, 0xff, v10
+; SI-NEXT: v_and_b32_e32 v51, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v7
; SI-NEXT: v_or_b32_e32 v51, v52, v51
; SI-NEXT: v_or_b32_e32 v37, v37, v51
; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v37, 0xff, v55
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v37, v37, v51
; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v7
; SI-NEXT: v_or_b32_e32 v20, v48, v20
; SI-NEXT: v_or_b32_e32 v20, v37, v20
; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0
; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v57
+; SI-NEXT: v_and_b32_e32 v20, 0xff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v20, v20, v37
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v37, 0xff, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v37, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v7
; SI-NEXT: v_or_b32_e32 v37, v48, v37
; SI-NEXT: v_or_b32_e32 v20, v20, v37
; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0
; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v17
+; SI-NEXT: v_and_b32_e32 v20, 0xff, v46
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v20, v20, v37
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7
; SI-NEXT: v_or_b32_e32 v18, v37, v18
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0
; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xff, v21
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v22
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v18, v18, v20
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v20, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7
; SI-NEXT: v_or_b32_e32 v20, v37, v20
; SI-NEXT: v_or_b32_e32 v18, v18, v20
; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0
; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xff, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v18, v18, v20
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7
; SI-NEXT: v_or_b32_e32 v15, v20, v15
; SI-NEXT: v_or_b32_e32 v15, v18, v15
; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0
; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v23
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xff, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 24, v0
; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v47
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_and_b32_e32 v18, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0
; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v43
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v21
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xff, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 32, v0
; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v60
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v15, v15, v18
-; SI-NEXT: v_and_b32_e32 v18, 0xff, v44
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v32
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0
; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v42
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v40
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xff, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 40, v0
; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v60
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v47
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_and_b32_e32 v18, 0xff, v31
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0
; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v22
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v53
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v30
; SI-NEXT: v_or_b32_e32 v15, v15, v18
-; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xff, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v27
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7
; SI-NEXT: v_or_b32_e32 v18, v20, v18
; SI-NEXT: v_or_b32_e32 v15, v15, v18
; SI-NEXT: v_add_i32_e32 v18, vcc, 48, v0
; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v24
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v32
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v63
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v15, v15, v18
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13
+; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v7
; SI-NEXT: v_or_b32_e32 v10, v18, v10
-; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: v_or_b32_e32 v10, v15, v10
; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0
; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v4
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v41
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v10, v10, v15
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13
+; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v7
; SI-NEXT: v_or_b32_e32 v15, v18, v15
; SI-NEXT: v_or_b32_e32 v10, v10, v15
; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0
; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v10, 0xff, v34
-; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v10, v10, v15
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v7
; SI-NEXT: v_or_b32_e32 v9, v15, v9
; SI-NEXT: v_or_b32_e32 v9, v10, v9
; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0
; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v3
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v36
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v9, v9, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v7
; SI-NEXT: v_or_b32_e32 v10, v15, v10
; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0
; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v59
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v56
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v3
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v7
; SI-NEXT: v_or_b32_e32 v8, v10, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v49
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v38
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v4
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v7
; SI-NEXT: v_or_b32_e32 v9, v10, v9
; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v62
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v58
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4
+; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v7
; SI-NEXT: v_or_b32_e32 v8, v8, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v30
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v62
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; SI-NEXT: v_or_b32_e32 v7, v9, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v36
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
@@ -197562,24 +197668,26 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0xff, v25
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4
-; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; SI-NEXT: v_or_b32_e32 v6, v8, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v35
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; SI-NEXT: v_or_b32_e32 v6, v6, v7
@@ -197594,25 +197702,24 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v23
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v4
-; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v45
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; SI-NEXT: v_or_b32_e32 v5, v7, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v19
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-NEXT: v_or_b32_e32 v5, v5, v6
@@ -197627,17 +197734,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v61
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v54
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
@@ -197645,11 +197748,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v40
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v29
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; SI-NEXT: v_or_b32_e32 v4, v4, v5
@@ -197664,27 +197768,23 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v58
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v39
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v35
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; SI-NEXT: v_or_b32_e32 v2, v5, v2
; SI-NEXT: v_or_b32_e32 v2, v4, v2
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v50
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v59
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v57
; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v39
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v42
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v43
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v4
@@ -197692,7 +197792,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v54
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v50
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v4
@@ -197704,33 +197804,33 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v56
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v26
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: v_or_b32_e32 v2, v4, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v12
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v27
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -197757,8 +197857,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v63, s30, 0
; VI-NEXT: v_writelane_b32 v63, s31, 1
@@ -197988,112 +198088,117 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: s_lshr_b32 s46, s45, 16
; VI-NEXT: v_mov_b32_e32 v7, 0x200
; VI-NEXT: v_add_f16_e32 v1, s46, v7
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s45, v7
; VI-NEXT: s_lshr_b32 s45, s44, 16
; VI-NEXT: v_or_b32_e32 v23, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s45, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s44, v7
; VI-NEXT: s_lshr_b32 s44, s43, 16
; VI-NEXT: v_or_b32_e32 v22, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s44, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s43, v7
; VI-NEXT: s_lshr_b32 s43, s42, 16
; VI-NEXT: v_or_b32_e32 v25, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s43, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s42, v7
; VI-NEXT: s_lshr_b32 s42, s41, 16
; VI-NEXT: v_or_b32_e32 v24, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s42, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s41, v7
; VI-NEXT: s_lshr_b32 s41, s40, 16
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v27, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s41, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s40, v7
; VI-NEXT: s_lshr_b32 s40, s15, 16
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_add_f16_e32 v53, s40, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v26, v2, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53
+; VI-NEXT: v_add_f16_e32 v1, s40, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s15, v7
; VI-NEXT: s_lshr_b32 s15, s14, 16
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v29, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s15, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s14, v7
; VI-NEXT: s_lshr_b32 s14, s13, 16
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_add_f16_e32 v43, s14, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v28, v2, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43
+; VI-NEXT: v_add_f16_e32 v1, s14, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s13, v7
; VI-NEXT: s_lshr_b32 s13, s12, 16
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v6, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s13, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s12, v7
; VI-NEXT: s_lshr_b32 s12, s11, 16
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_add_f16_e32 v37, s12, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v5, v2, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37
+; VI-NEXT: v_add_f16_e32 v1, s12, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s11, v7
; VI-NEXT: s_lshr_b32 s11, s10, 16
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v31, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s11, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s10, v7
; VI-NEXT: s_lshr_b32 s10, s9, 16
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_add_f16_e32 v52, s10, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v30, v2, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52
+; VI-NEXT: v_add_f16_e32 v1, s10, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s9, v7
; VI-NEXT: s_lshr_b32 s9, s8, 16
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v4, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s9, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s8, v7
; VI-NEXT: s_lshr_b32 s8, s7, 16
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_add_f16_e32 v50, s8, v7
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v3, v2, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v50
+; VI-NEXT: v_add_f16_e32 v1, s8, v7
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v2, s7, v7
; VI-NEXT: s_lshr_b32 s7, s6, 16
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v2, v2, v1
; VI-NEXT: v_add_f16_e32 v1, s7, v7
; VI-NEXT: v_add_f16_e32 v8, s6, v7
; VI-NEXT: s_lshr_b32 s6, s17, 16
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_add_f16_e32 v36, s6, v7
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v1, v8, v1
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v36
; VI-NEXT: v_add_f16_e32 v9, s17, v7
@@ -198101,12 +198206,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_e32 v33, v9, v8
; VI-NEXT: v_add_f16_e32 v8, s6, v7
; VI-NEXT: s_lshr_b32 s6, s19, 16
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_add_f16_e32 v9, s16, v7
; VI-NEXT: v_add_f16_e32 v38, s6, v7
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v32, v9, v8
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v38
; VI-NEXT: v_add_f16_e32 v9, s19, v7
@@ -198114,12 +198219,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_e32 v21, v9, v8
; VI-NEXT: v_add_f16_e32 v8, s6, v7
; VI-NEXT: s_lshr_b32 s6, s21, 16
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_add_f16_e32 v9, s18, v7
; VI-NEXT: v_add_f16_e32 v61, s6, v7
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v20, v9, v8
; VI-NEXT: s_lshr_b32 s7, s20, 16
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v61
@@ -198127,12 +198232,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_e32 v35, v9, v8
; VI-NEXT: v_add_f16_e32 v8, s7, v7
; VI-NEXT: s_lshr_b32 s6, s23, 16
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_add_f16_e32 v9, s20, v7
; VI-NEXT: v_add_f16_e32 v45, s6, v7
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v34, v9, v8
; VI-NEXT: s_lshr_b32 s7, s22, 16
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v45
@@ -198140,12 +198245,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_e32 v19, v9, v8
; VI-NEXT: v_add_f16_e32 v8, s7, v7
; VI-NEXT: s_lshr_b32 s6, s25, 16
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_add_f16_e32 v9, s22, v7
; VI-NEXT: v_add_f16_e32 v47, s6, v7
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v18, v9, v8
; VI-NEXT: s_lshr_b32 s7, s24, 16
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v47
@@ -198153,12 +198258,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_e32 v16, v9, v8
; VI-NEXT: v_add_f16_e32 v8, s7, v7
; VI-NEXT: s_lshr_b32 s6, s27, 16
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_add_f16_e32 v9, s24, v7
; VI-NEXT: v_add_f16_e32 v57, s6, v7
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v15, v9, v8
; VI-NEXT: s_lshr_b32 s7, s26, 16
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v57
@@ -198166,12 +198271,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_e32 v13, v9, v8
; VI-NEXT: v_add_f16_e32 v8, s7, v7
; VI-NEXT: s_lshr_b32 s6, s29, 16
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_add_f16_e32 v9, s26, v7
; VI-NEXT: v_add_f16_e32 v59, s6, v7
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v12, v9, v8
; VI-NEXT: s_lshr_b32 s7, s28, 16
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v59
@@ -198181,96 +198286,80 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_add_f16_e32 v8, s7, v7
; VI-NEXT: s_lshr_b32 s7, s4, 16
; VI-NEXT: v_add_f16_e32 v51, s6, v7
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_add_f16_e32 v9, s28, v7
; VI-NEXT: v_add_f16_e32 v54, s5, v7
-; VI-NEXT: v_add_f16_e32 v11, s7, v7
+; VI-NEXT: v_add_f16_e32 v53, s7, v7
; VI-NEXT: v_add_f16_e32 v55, s4, v7
; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_or_b32_e32 v9, v9, v8
; VI-NEXT: v_or_b32_e32 v8, v54, v7
-; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v53
; VI-NEXT: v_or_b32_e32 v7, v55, v7
; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v8
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7
; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v10
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v9
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v13
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v2
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4]
; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v16
; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v13
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13]
-; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v18
; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4]
-; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v35
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v19
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v35
; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35]
; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v23
; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[22:23]
-; VI-NEXT: v_bfe_u32 v23, v50, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v23, v52, 8, 8
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v23, v37, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v23, v43, 8, 8
-; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v23, v53, 8, 8
-; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[24:25]
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[30:31]
-; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v20
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v10
+; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v20
; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5
; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v21
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v34
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v21
+; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v32
; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33]
-; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v6
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29
; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[28:29]
+; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26
; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[26:27]
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32
-; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[24:25]
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v33
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v27
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26
; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v25
; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v22
; VI-NEXT: v_bfe_u32 v25, v51, 8, 8
@@ -198282,11 +198371,31 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_bfe_u32 v22, v38, 8, 8
; VI-NEXT: v_bfe_u32 v2, v36, 8, 8
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_bfe_u32 v26, v50, 8, 8
-; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_bfe_u32 v23, v23, 8, 8
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_bfe_u32 v24, v24, 8, 8
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_bfe_u32 v26, v26, 8, 8
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v23, v23, 8, 8
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v23, v23, 8, 8
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v23, v23, 8, 8
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v23, v23, 8, 8
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v23, v23, 8, 8
; VI-NEXT: s_branch .LBB95_5
; VI-NEXT: .LBB95_3:
; VI-NEXT: ; implicit-def: $sgpr46
@@ -198446,133 +198555,136 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: s_branch .LBB95_2
; VI-NEXT: .LBB95_4:
; VI-NEXT: v_mov_b32_e32 v1, s44
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s45
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s42
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s43
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s40
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s41
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s14
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s15
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s12
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s13
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s10
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s11
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s16
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s18
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s19
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s20
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s22
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s71
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s70
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s69
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s68
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s67
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s66
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s65
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s64
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s55
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s54
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s87
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s86
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s85
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s84
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s53
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s52
; VI-NEXT: v_readlane_b32 s6, v62, 0
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_readlane_b32 s6, v62, 1
; VI-NEXT: v_mov_b32_e32 v36, s6
; VI-NEXT: v_readlane_b32 s6, v62, 2
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_readlane_b32 s6, v62, 3
; VI-NEXT: v_mov_b32_e32 v38, s6
; VI-NEXT: v_readlane_b32 s6, v62, 4
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_readlane_b32 s6, v62, 5
; VI-NEXT: v_mov_b32_e32 v61, s6
; VI-NEXT: v_readlane_b32 s6, v62, 6
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_readlane_b32 s6, v62, 7
; VI-NEXT: v_mov_b32_e32 v45, s6
; VI-NEXT: v_readlane_b32 s6, v62, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_readlane_b32 s6, v62, 9
; VI-NEXT: v_mov_b32_e32 v47, s6
; VI-NEXT: v_readlane_b32 s6, v62, 10
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_readlane_b32 s6, v62, 11
-; VI-NEXT: v_mov_b32_e32 v57, s6
-; VI-NEXT: v_readlane_b32 s6, v62, 12
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: v_readlane_b32 s6, v62, 13
; VI-NEXT: v_mov_b32_e32 v55, s4
; VI-NEXT: v_readlane_b32 s4, v62, 16
-; VI-NEXT: v_mov_b32_e32 v59, s6
-; VI-NEXT: v_readlane_b32 s6, v62, 14
+; VI-NEXT: v_mov_b32_e32 v57, s6
+; VI-NEXT: v_readlane_b32 s6, v62, 12
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_readlane_b32 s4, v62, 17
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_mov_b32_e32 v22, s4
; VI-NEXT: v_readlane_b32 s4, v62, 18
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_readlane_b32 s4, v62, 19
; VI-NEXT: v_mov_b32_e32 v15, s4
@@ -198587,77 +198699,74 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_readlane_b32 s4, v62, 24
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_readlane_b32 s4, v62, 25
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_readlane_b32 s4, v62, 26
-; VI-NEXT: v_mov_b32_e32 v46, s4
+; VI-NEXT: v_mov_b32_e32 v58, s4
; VI-NEXT: v_readlane_b32 s4, v62, 27
-; VI-NEXT: v_mov_b32_e32 v41, s4
+; VI-NEXT: v_mov_b32_e32 v56, s4
; VI-NEXT: v_readlane_b32 s4, v62, 28
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, s80
-; VI-NEXT: v_mov_b32_e32 v60, s4
+; VI-NEXT: v_mov_b32_e32 v44, s4
; VI-NEXT: v_readlane_b32 s4, v62, 29
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, s81
-; VI-NEXT: v_mov_b32_e32 v40, s4
+; VI-NEXT: v_mov_b32_e32 v49, s4
; VI-NEXT: v_readlane_b32 s4, v62, 30
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, s82
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, s80
; VI-NEXT: v_mov_b32_e32 v39, s4
; VI-NEXT: v_readlane_b32 s4, v62, 31
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, s81
+; VI-NEXT: v_mov_b32_e32 v37, s4
; VI-NEXT: v_readlane_b32 s4, v62, 32
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, s82
+; VI-NEXT: v_mov_b32_e32 v42, s4
; VI-NEXT: v_readlane_b32 s4, v62, 33
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 34
; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_readlane_b32 s4, v62, 34
+; VI-NEXT: v_mov_b32_e32 v41, s4
; VI-NEXT: v_readlane_b32 s4, v62, 35
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_mov_b32_e32 v43, s4
; VI-NEXT: v_readlane_b32 s4, v62, 36
; VI-NEXT: v_mov_b32_e32 v48, s4
; VI-NEXT: v_readlane_b32 s4, v62, 37
-; VI-NEXT: v_mov_b32_e32 v49, s4
+; VI-NEXT: v_mov_b32_e32 v60, s4
; VI-NEXT: v_readlane_b32 s4, v62, 38
-; VI-NEXT: v_mov_b32_e32 v44, s4
+; VI-NEXT: v_mov_b32_e32 v46, s4
; VI-NEXT: v_readlane_b32 s4, v62, 39
-; VI-NEXT: v_mov_b32_e32 v42, s4
+; VI-NEXT: v_mov_b32_e32 v50, s4
; VI-NEXT: v_readlane_b32 s4, v62, 40
-; VI-NEXT: v_mov_b32_e32 v56, s4
+; VI-NEXT: v_mov_b32_e32 v40, s4
; VI-NEXT: v_readlane_b32 s4, v62, 41
-; VI-NEXT: v_mov_b32_e32 v58, s4
+; VI-NEXT: v_mov_b32_e32 v52, s4
; VI-NEXT: v_readlane_b32 s4, v62, 42
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_readlane_b32 s4, v62, 43
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_readlane_b32 s4, v62, 44
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_readlane_b32 s4, v62, 45
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_readlane_b32 s4, v62, 46
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_readlane_b32 s4, v62, 47
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, s4
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, s78
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; VI-NEXT: v_readlane_b32 s4, v62, 48
; VI-NEXT: v_mov_b32_e32 v31, s4
; VI-NEXT: v_readlane_b32 s4, v62, 49
-; VI-NEXT: v_mov_b32_e32 v30, s4
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, s78
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; VI-NEXT: v_readlane_b32 s4, v62, 50
; VI-NEXT: v_mov_b32_e32 v33, s4
; VI-NEXT: v_readlane_b32 s4, v62, 51
@@ -198667,20 +198776,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_readlane_b32 s4, v62, 53
; VI-NEXT: v_mov_b32_e32 v28, s4
; VI-NEXT: v_readlane_b32 s4, v62, 54
+; VI-NEXT: v_readlane_b32 s6, v62, 13
; VI-NEXT: v_mov_b32_e32 v34, s4
; VI-NEXT: v_readlane_b32 s4, v62, 55
+; VI-NEXT: v_mov_b32_e32 v59, s6
+; VI-NEXT: v_readlane_b32 s6, v62, 14
; VI-NEXT: v_mov_b32_e32 v9, s4
; VI-NEXT: v_readlane_b32 s4, v62, 56
; VI-NEXT: v_mov_b32_e32 v3, s88
+; VI-NEXT: v_mov_b32_e32 v53, s6
; VI-NEXT: v_readlane_b32 s6, v62, 15
; VI-NEXT: v_mov_b32_e32 v21, s4
; VI-NEXT: v_readlane_b32 s4, v62, 57
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v50, s70
-; VI-NEXT: v_mov_b32_e32 v43, s54
-; VI-NEXT: v_mov_b32_e32 v37, s86
-; VI-NEXT: v_mov_b32_e32 v52, s84
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v51, s6
; VI-NEXT: v_mov_b32_e32 v54, s5
; VI-NEXT: v_mov_b32_e32 v23, s83
@@ -198702,8 +198811,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v11, s38
; VI-NEXT: v_mov_b32_e32 v14, s48
; VI-NEXT: .LBB95_5: ; %end
-; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v58
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v52
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198744,31 +198853,31 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v58, v53, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v52, v30, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v20, v53, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v20, v58, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v20, v30, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v20, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v46
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v58
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v20, v46, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v20, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v20, vcc, 4, v0
; VI-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v40
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v19, vcc, 8, v0
; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v41
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v22
@@ -198776,36 +198885,36 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v19, vcc, 12, v0
; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0
; VI-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v15
@@ -198813,9 +198922,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v16
@@ -198824,7 +198933,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198833,8 +198942,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198844,10 +198953,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
@@ -198855,11 +198962,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v43
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10
@@ -198868,10 +198973,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v42
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27
@@ -198879,18 +198982,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v7
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v25
; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -198899,8 +198998,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198910,17 +199009,19 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198930,8 +199031,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198941,8 +199042,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198952,8 +199053,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198963,86 +199064,90 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v30
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -199050,28 +199155,29 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -199079,15 +199185,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -199108,8 +199214,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -199525,7 +199631,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22
@@ -199533,7 +199639,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22
@@ -199832,7 +199938,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_readlane_b32 s4, v62, 22
; GFX9-NEXT: v_mov_b32_e32 v60, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 23
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v17, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 24
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
@@ -199840,7 +199946,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: v_readlane_b32 s4, v62, 25
; GFX9-NEXT: v_mov_b32_e32 v23, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 26
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v17, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 27
; GFX9-NEXT: v_mov_b32_e32 v59, s4
@@ -200110,14 +200216,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -202550,19 +202656,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
@@ -202588,431 +202694,439 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16
; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; kill: killed $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:360
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:360
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25
-; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300
+; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108
-; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v34
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108
+; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v11
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4
+; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8
+; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:336
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9
+; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:376
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:368
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364
+; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v16
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:384
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:384
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11
-; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v15
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8
+; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v11
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v17
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v7
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48
-; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13
-; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48
+; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15
+; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; kill: killed $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -203053,19 +203167,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v12, v1, v2
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_or_b32_e32 v7, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v42, v2, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -203074,7 +203195,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v5, v2, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -203082,488 +203203,444 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v7, v16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v26, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v11, v2, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v40, v1, v2
+; SI-NEXT: v_or_b32_e32 v42, v1, v2
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v10, v24, v1
+; SI-NEXT: v_or_b32_e32 v39, v24, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v15, v2, v1
+; SI-NEXT: v_or_b32_e32 v17, v2, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v26, v1, v2
+; SI-NEXT: v_or_b32_e32 v55, v1, v2
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v18, v25, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v18, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v19, v2, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v20, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v30, v2, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v28, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v21, v2, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v27, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v29, v1, v2
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v33, v2, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v30, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v27, v2, v1
+; SI-NEXT: v_or_b32_e32 v53, v2, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v19, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v38, v1, v2
+; SI-NEXT: v_or_b32_e32 v33, v2, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v52, v1, v2
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v43, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v23
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v15, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v2, v1, v2
+; SI-NEXT: v_or_b32_e32 v38, v2, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v45, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v55, v1, v6
+; SI-NEXT: v_or_b32_e32 v47, v1, v2
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v17, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v14, v1, v6
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v44, v1, v6
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v45, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v57, v3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v25, v1, v6
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v59, v1, v3
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v4, v3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v47, v1, v6
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v62, v3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT: v_or_b32_e32 v44, v1, v3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v58, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v16, v1, v6
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v59, v1, v6
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v8, v1, v3
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v62, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v21, v1, v3
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v24, v1, v6
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v6, v3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v13, v1, v6
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v3, v1, v3
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v6, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v16, v1, v12
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v23, v1, v8
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v24, v12, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v36, v1, v8
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_mov_b32_e32 v1, v32
+; SI-NEXT: v_or_b32_e32 v32, v10, v12
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v8, v8, v35
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v10
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_mov_b32_e32 v10, v34
+; SI-NEXT: v_or_b32_e32 v34, v22, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_or_b32_e32 v35, v37, v22
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_or_b32_e32 v36, v12, v35
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_or_b32_e32 v37, v51, v22
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v35, v37, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v51, v22, v63
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v34
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_or_b32_e32 v34, v56, v22
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v37, v51, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_or_b32_e32 v56, v60, v22
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v54
-; SI-NEXT: v_or_b32_e32 v54, v22, v4
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v41
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_or_b32_e32 v9, v9, v22
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v57
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v32, v4
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_or_b32_e32 v51, v12, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v14, v31, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v31, v61, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v54
+; SI-NEXT: v_or_b32_e32 v54, v12, v23
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v46
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v23, v25, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v41
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v13, v13, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v28, v22, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v32, v22, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v25, v22, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v39, v22, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v40, v22, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v41, v22, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v57, v22, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v46, v22, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v60, v22, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v63, v22, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v61, v22, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v1, v22, v1
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v22, 0xff, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v55
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v1, v1, v45
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v48, v22, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v10, v22, v10
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v46, v22, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v49, v22, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT: v_or_b32_e32 v48, v22, v48
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v50, v22, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT: v_or_b32_e32 v56, v22, v49
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v49, v22, v49
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v20, v22, v20
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT: v_or_b32_e32 v50, v22, v50
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: v_or_b32_e32 v53, v22, v53
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v58, v22, v58
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: v_or_b32_e32 v3, v22, v3
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v61
-; SI-NEXT: v_mov_b32_e32 v61, v42
-; SI-NEXT: v_or_b32_e32 v31, v22, v31
-; SI-NEXT: v_or_b32_e32 v22, v12, v61
-; SI-NEXT: v_and_b32_e32 v12, 0xffff, v28
-; SI-NEXT: v_or_b32_e32 v43, v12, v5
+; SI-NEXT: v_or_b32_e32 v9, v22, v9
+; SI-NEXT: v_and_b32_e32 v22, 0xff, v52
+; SI-NEXT: v_or_b32_e32 v52, v22, v63
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v22, v7, v63
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v12
+; SI-NEXT: v_or_b32_e32 v12, v7, v5
; SI-NEXT: v_alignbit_b32 v5, v22, v5, 16
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v12, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v25
; SI-NEXT: v_or_b32_e32 v7, v7, v11
-; SI-NEXT: v_and_b32_e32 v32, 0xffff, v49
-; SI-NEXT: v_or_b32_e32 v32, v32, v59
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v25, v39
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT: v_or_b32_e32 v5, v5, v12
+; SI-NEXT: v_or_b32_e32 v5, v5, v26
; SI-NEXT: v_alignbit_b32 v11, v5, v11, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v42
+; SI-NEXT: v_or_b32_e32 v42, v11, v25
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40
-; SI-NEXT: v_or_b32_e32 v42, v11, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v39
-; SI-NEXT: v_or_b32_e32 v40, v11, v15
-; SI-NEXT: v_alignbit_b32 v11, v42, v15, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26
-; SI-NEXT: v_or_b32_e32 v26, v11, v18
+; SI-NEXT: v_or_b32_e32 v40, v11, v17
+; SI-NEXT: v_alignbit_b32 v11, v42, v17, 16
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v55
+; SI-NEXT: v_or_b32_e32 v55, v11, v18
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v41
-; SI-NEXT: v_or_b32_e32 v39, v11, v19
-; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v39, v11, v20
+; SI-NEXT: v_alignbit_b32 v11, v55, v20, 16
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: v_mov_b32_e32 v41, v28
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: v_or_b32_e32 v28, v11, v30
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v57
-; SI-NEXT: v_or_b32_e32 v11, v11, v21
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v11, v28, v21, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v28, v11, v41
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v46
+; SI-NEXT: v_or_b32_e32 v20, v11, v27
+; SI-NEXT: v_alignbit_b32 v11, v28, v27, 16
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29
-; SI-NEXT: v_or_b32_e32 v29, v11, v33
+; SI-NEXT: v_or_b32_e32 v29, v11, v30
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v60
-; SI-NEXT: v_or_b32_e32 v21, v11, v27
-; SI-NEXT: v_alignbit_b32 v11, v29, v27, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v27, v11, v53
+; SI-NEXT: v_alignbit_b32 v11, v29, v53, 16
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v19
+; SI-NEXT: v_or_b32_e32 v19, v11, v33
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v61
+; SI-NEXT: v_or_b32_e32 v11, v11, v43
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v11, v19, v43, 16
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v15
+; SI-NEXT: v_or_b32_e32 v11, v11, v38
+; SI-NEXT: v_alignbit_b32 v1, v11, v45, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: v_or_b32_e32 v19, v11, v38
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v63
-; SI-NEXT: v_or_b32_e32 v27, v11, v52
-; SI-NEXT: v_alignbit_b32 v11, v19, v52, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47
+; SI-NEXT: v_or_b32_e32 v15, v1, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT: v_or_b32_e32 v1, v1, v57
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: v_or_b32_e32 v11, v11, v2
-; SI-NEXT: v_alignbit_b32 v1, v11, v55, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v15, v57, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17
-; SI-NEXT: v_or_b32_e32 v15, v1, v14
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59
+; SI-NEXT: v_or_b32_e32 v17, v1, v4
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48
-; SI-NEXT: v_or_b32_e32 v1, v1, v44
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v44, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v1, v1, v62
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45
-; SI-NEXT: v_or_b32_e32 v17, v1, v25
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46
-; SI-NEXT: v_or_b32_e32 v1, v1, v47
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v17, v62, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v17, v47, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58
-; SI-NEXT: v_or_b32_e32 v1, v1, v16
-; SI-NEXT: v_alignbit_b32 v32, v1, v59, 16
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff, v62
-; SI-NEXT: v_or_b32_e32 v59, v6, v23
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20
-; SI-NEXT: v_or_b32_e32 v62, v32, v24
-; SI-NEXT: v_and_b32_e32 v32, 0xffff, v50
-; SI-NEXT: v_or_b32_e32 v50, v6, v36
-; SI-NEXT: v_alignbit_b32 v6, v59, v36, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8
-; SI-NEXT: v_or_b32_e32 v47, v6, v35
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53
-; SI-NEXT: v_or_b32_e32 v49, v6, v37
-; SI-NEXT: v_alignbit_b32 v6, v47, v37, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51
-; SI-NEXT: v_or_b32_e32 v45, v6, v34
-; SI-NEXT: v_or_b32_e32 v48, v3, v56
-; SI-NEXT: v_alignbit_b32 v3, v45, v56, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54
-; SI-NEXT: v_or_b32_e32 v44, v3, v4
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31
-; SI-NEXT: v_or_b32_e32 v3, v3, v9
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14
-; SI-NEXT: v_mov_b32_e32 v14, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44
+; SI-NEXT: v_and_b32_e32 v10, 0xffff, v56
+; SI-NEXT: v_or_b32_e32 v1, v1, v8
+; SI-NEXT: v_or_b32_e32 v10, v10, v21
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v10, v1, v21, 16
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v62, v6, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
@@ -203747,133 +203824,175 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_or_b32_e32 v6, v6, v16
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v6, v62, v16, 16
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_or_b32_e32 v59, v6, v32
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v50
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_or_b32_e32 v56, v6, v34
+; SI-NEXT: v_alignbit_b32 v6, v59, v34, 16
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v36
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_or_b32_e32 v47, v6, v35
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v58
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_or_b32_e32 v50, v6, v37
+; SI-NEXT: v_alignbit_b32 v6, v47, v37, 16
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_or_b32_e32 v45, v6, v14
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v9
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_or_b32_e32 v49, v6, v31
+; SI-NEXT: v_alignbit_b32 v6, v45, v31, 16
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v54
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_or_b32_e32 v44, v6, v23
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v52
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: v_or_b32_e32 v46, v32, v13
-; SI-NEXT: v_alignbit_b32 v13, v62, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v44, v9, 16
+; SI-NEXT: v_or_b32_e32 v48, v6, v13
+; SI-NEXT: v_alignbit_b32 v6, v44, v13, 16
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v61
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v41
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v30
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v35
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: .LBB96_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB96_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61
+; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41
-; SI-NEXT: v_or_b32_e32 v1, v31, v1
+; SI-NEXT: v_or_b32_e32 v1, v63, v1
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT: v_or_b32_e32 v2, v9, v2
+; SI-NEXT: v_or_b32_e32 v2, v13, v2
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v2
-; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46
+; SI-NEXT: v_or_b32_e32 v2, v23, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: s_movk_i32 s6, 0x300
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
-; SI-NEXT: v_or_b32_e32 v4, v32, v4
+; SI-NEXT: v_or_b32_e32 v3, v25, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v56, v5
-; SI-NEXT: v_mov_b32_e32 v30, v16
+; SI-NEXT: v_or_b32_e32 v5, v31, v5
; SI-NEXT: s_mov_b32 s7, 0x3000000
-; SI-NEXT: v_mov_b32_e32 v31, v24
+; SI-NEXT: v_mov_b32_e32 v30, v24
; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v2
-; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v1
+; SI-NEXT: v_mov_b32_e32 v48, v31
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; SI-NEXT: v_or_b32_e32 v3, v9, v3
; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; SI-NEXT: v_or_b32_e32 v4, v61, v4
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
@@ -203882,41 +204001,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v37, v7
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v60, v4
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v48, v32
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_or_b32_e32 v4, v63, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_or_b32_e32 v5, v53, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v5
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v49, v33
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_or_b32_e32 v3, v60, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v5, v5, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_or_b32_e32 v3, v58, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v6, v3
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
@@ -203924,13 +204029,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v47, vcc, s7, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v35
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -203939,16 +204047,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v7
-; SI-NEXT: v_mov_b32_e32 v50, v6
+; SI-NEXT: v_mov_b32_e32 v56, v6
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -203957,15 +204065,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_or_b32_e32 v9, v10, v9
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v59, vcc, s7, v8
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -203974,16 +204082,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v9
-; SI-NEXT: v_mov_b32_e32 v46, v8
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10
; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -203992,15 +204099,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v62, vcc, s7, v10
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204009,15 +204116,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v13, v12
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v11
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12
; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v12, v13, v12
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204026,15 +204133,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_or_b32_e32 v13, v14, v13
; SI-NEXT: v_or_b32_e32 v12, v13, v12
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v12
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13
; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v13, v14, v13
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204043,45 +204150,48 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: v_or_b32_e32 v13, v14, v13
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14
; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v14, v15, v14
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14
; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15
; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_or_b32_e32 v15, v17, v15
+; SI-NEXT: v_or_b32_e32 v15, v16, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15
; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v15, v17, v15
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v15, v16, v15
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_or_b32_e32 v16, v17, v16
; SI-NEXT: v_or_b32_e32 v15, v16, v15
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v15
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v16, v17, v16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16
; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204090,9 +204200,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_or_b32_e32 v17, v18, v17
; SI-NEXT: v_or_b32_e32 v16, v17, v16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v16
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v15
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
@@ -204107,30 +204218,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_or_b32_e32 v18, v19, v18
; SI-NEXT: v_or_b32_e32 v18, v18, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v18
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v17, v19, v17
-; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v23
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19
; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17
; SI-NEXT: v_or_b32_e32 v19, v20, v19
-; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
; SI-NEXT: v_or_b32_e32 v19, v19, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v19
; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v17, v20, v17
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17
@@ -204142,13 +204253,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v20, v21, v20
; SI-NEXT: v_or_b32_e32 v20, v20, v17
; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v20
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v17, v21, v17
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17
; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204157,15 +204269,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v21, v22, v21
; SI-NEXT: v_or_b32_e32 v21, v21, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v21
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v17, v22, v17
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17
; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204174,32 +204286,31 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; SI-NEXT: v_or_b32_e32 v22, v23, v22
; SI-NEXT: v_or_b32_e32 v22, v22, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v22
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v22
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v17, v23, v17
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17
; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
; SI-NEXT: v_and_b32_e32 v23, 0xff, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_or_b32_e32 v23, v26, v23
+; SI-NEXT: v_or_b32_e32 v23, v25, v23
; SI-NEXT: v_or_b32_e32 v23, v23, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v26, v25
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v23
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
; SI-NEXT: v_or_b32_e32 v24, v24, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v24
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204208,16 +204319,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_or_b32_e32 v25, v25, v17
; SI-NEXT: v_or_b32_e32 v2, v25, v2
-; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14
-; SI-NEXT: v_mov_b32_e32 v14, v27
-; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20
+; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v18
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204227,10 +204337,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v28
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -204247,8 +204357,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -204261,16 +204371,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v26, v3
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v55
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
@@ -204283,8 +204394,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v40, vcc, s7, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -204297,30 +204408,29 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v31, v3
+; SI-NEXT: v_or_b32_e32 v3, v30, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v42, vcc, s7, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v42
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v42
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT: v_or_b32_e32 v2, v34, v2
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -204333,28 +204443,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v30, v3
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT: v_or_b32_e32 v2, v32, v2
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v2
+; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
@@ -204363,7 +204472,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -204373,236 +204482,239 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v2
-; SI-NEXT: v_alignbit_b32 v2, v22, v43, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v2, v22, v12, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v2, v5, v7, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v2, v42, v40, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v26, v39, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v28, v18, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v29, v21, 16
+; SI-NEXT: v_alignbit_b32 v2, v55, v39, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16
+; SI-NEXT: v_alignbit_b32 v2, v28, v20, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v11, v16, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v2, v29, v27, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v15, v13, 16
+; SI-NEXT: v_alignbit_b32 v2, v19, v16, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v17, v10, 16
+; SI-NEXT: v_alignbit_b32 v2, v11, v14, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v1, v9, 16
+; SI-NEXT: v_alignbit_b32 v2, v15, v13, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v17, v10, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v59, v6, 16
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v1, v9, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v47, v33, 16
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v45, v32, 16
+; SI-NEXT: v_alignbit_b32 v2, v59, v6, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v44, v14, 16
+; SI-NEXT: v_alignbit_b32 v2, v47, v35, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v2, v45, v33, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v2, v44, v31, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v17
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v47
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v47
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v45
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v44
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
; SI-NEXT: .LBB96_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v3, v3, v9
; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57
+; SI-NEXT: v_or_b32_e32 v3, v3, v9
+; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v55
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v21
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v27
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v29
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v27
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -204614,9 +204726,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -204628,9 +204740,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -204638,13 +204750,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -204652,19 +204764,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
@@ -208877,8 +208989,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -208896,21 +209008,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:328
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:320
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:308
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:304
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:300
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:296
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:292
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:288
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:332
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:324
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:320
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:308
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:304
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:300
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:296
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:292
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:256
; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:244
@@ -208920,22 +209032,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:236
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:232
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:228
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:224
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:196
-; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196
; SI-NEXT: v_writelane_b32 v63, s30, 0
-; SI-NEXT: v_writelane_b32 v62, s28, 0
-; SI-NEXT: v_writelane_b32 v62, s25, 1
-; SI-NEXT: v_writelane_b32 v62, s24, 2
-; SI-NEXT: v_writelane_b32 v62, s23, 3
-; SI-NEXT: v_writelane_b32 v62, s22, 4
-; SI-NEXT: v_writelane_b32 v62, s21, 5
-; SI-NEXT: v_writelane_b32 v62, s18, 6
-; SI-NEXT: v_writelane_b32 v62, s16, 7
; SI-NEXT: v_writelane_b32 v63, s31, 1
; SI-NEXT: v_writelane_b32 v63, s34, 2
; SI-NEXT: v_writelane_b32 v63, s35, 3
@@ -208948,17 +209051,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v63, s50, 10
; SI-NEXT: v_writelane_b32 v63, s51, 11
; SI-NEXT: v_writelane_b32 v63, s52, 12
+; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; SI-NEXT: v_writelane_b32 v63, s53, 13
+; SI-NEXT: v_writelane_b32 v62, s28, 0
; SI-NEXT: v_writelane_b32 v63, s54, 14
+; SI-NEXT: v_writelane_b32 v62, s27, 1
; SI-NEXT: v_writelane_b32 v63, s55, 15
+; SI-NEXT: v_writelane_b32 v62, s26, 2
; SI-NEXT: v_writelane_b32 v63, s64, 16
+; SI-NEXT: v_writelane_b32 v62, s25, 3
; SI-NEXT: v_writelane_b32 v63, s65, 17
+; SI-NEXT: v_writelane_b32 v62, s24, 4
; SI-NEXT: v_writelane_b32 v63, s66, 18
+; SI-NEXT: v_writelane_b32 v62, s23, 5
; SI-NEXT: v_writelane_b32 v63, s67, 19
+; SI-NEXT: v_writelane_b32 v62, s22, 6
; SI-NEXT: v_writelane_b32 v63, s68, 20
+; SI-NEXT: v_writelane_b32 v62, s21, 7
; SI-NEXT: v_writelane_b32 v63, s69, 21
+; SI-NEXT: v_writelane_b32 v62, s20, 8
; SI-NEXT: v_writelane_b32 v63, s70, 22
+; SI-NEXT: v_writelane_b32 v62, s18, 9
; SI-NEXT: v_writelane_b32 v63, s71, 23
+; SI-NEXT: v_writelane_b32 v62, s16, 10
; SI-NEXT: v_writelane_b32 v63, s80, 24
; SI-NEXT: v_writelane_b32 v63, s81, 25
; SI-NEXT: v_writelane_b32 v63, s82, 26
@@ -208970,254 +209085,252 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v63, s96, 32
; SI-NEXT: v_writelane_b32 v63, s97, 33
; SI-NEXT: v_writelane_b32 v63, s98, 34
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v21, v5
; SI-NEXT: v_writelane_b32 v63, s99, 35
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v29, v26
; SI-NEXT: v_readfirstlane_b32 s15, v16
-; SI-NEXT: v_readfirstlane_b32 s18, v25
+; SI-NEXT: v_readfirstlane_b32 s21, v25
; SI-NEXT: v_readfirstlane_b32 s43, v15
; SI-NEXT: v_readfirstlane_b32 s42, v24
; SI-NEXT: v_readfirstlane_b32 s44, v23
-; SI-NEXT: v_readfirstlane_b32 s49, v12
-; SI-NEXT: v_readfirstlane_b32 s8, v11
-; SI-NEXT: v_readfirstlane_b32 s53, v20
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v62, s4, 8
-; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v62, s4, 9
-; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v62, s4, 10
-; SI-NEXT: v_readfirstlane_b32 s4, v50
+; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: v_writelane_b32 v62, s4, 11
-; SI-NEXT: v_readfirstlane_b32 s79, v52
-; SI-NEXT: v_readfirstlane_b32 s88, v54
-; SI-NEXT: v_readfirstlane_b32 s4, v55
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:176
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168
+; SI-NEXT: v_readfirstlane_b32 s4, v34
+; SI-NEXT: v_writelane_b32 v62, s4, 12
+; SI-NEXT: v_readfirstlane_b32 s52, v37
+; SI-NEXT: v_readfirstlane_b32 s82, v48
+; SI-NEXT: v_readfirstlane_b32 s4, v53
+; SI-NEXT: v_readfirstlane_b32 s79, v50
+; SI-NEXT: v_readfirstlane_b32 s88, v52
+; SI-NEXT: v_writelane_b32 v62, s4, 13
+; SI-NEXT: v_readfirstlane_b32 s77, v55
+; SI-NEXT: v_readfirstlane_b32 s4, v41
+; SI-NEXT: v_readfirstlane_b32 s35, v42
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:140
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:136
-; SI-NEXT: v_writelane_b32 v62, s4, 12
-; SI-NEXT: v_readfirstlane_b32 s77, v41
-; SI-NEXT: v_readfirstlane_b32 s4, v42
-; SI-NEXT: v_readfirstlane_b32 s94, v31
-; SI-NEXT: v_readfirstlane_b32 s70, v32
-; SI-NEXT: v_readfirstlane_b32 s51, v33
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:140
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136
+; SI-NEXT: v_readfirstlane_b32 s16, v31
+; SI-NEXT: v_readfirstlane_b32 s26, v32
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s37, v45
-; SI-NEXT: v_readfirstlane_b32 s24, v56
+; SI-NEXT: v_readfirstlane_b32 s76, v45
+; SI-NEXT: v_readfirstlane_b32 s66, v56
; SI-NEXT: v_readfirstlane_b32 s7, v57
; SI-NEXT: v_readfirstlane_b32 s92, v58
-; SI-NEXT: v_readfirstlane_b32 s28, v59
+; SI-NEXT: v_readfirstlane_b32 s27, v59
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80
-; SI-NEXT: v_readfirstlane_b32 s35, v43
-; SI-NEXT: v_readfirstlane_b32 s55, v46
-; SI-NEXT: v_readfirstlane_b32 s68, v35
-; SI-NEXT: v_readfirstlane_b32 s87, v37
+; SI-NEXT: v_readfirstlane_b32 s51, v44
+; SI-NEXT: v_readfirstlane_b32 s55, v47
+; SI-NEXT: v_readfirstlane_b32 s6, v35
+; SI-NEXT: v_readfirstlane_b32 s98, v36
+; SI-NEXT: v_readfirstlane_b32 s18, v38
; SI-NEXT: v_readfirstlane_b32 s67, v39
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s74, v53
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64
-; SI-NEXT: v_readfirstlane_b32 s85, v48
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40
-; SI-NEXT: v_writelane_b32 v62, s4, 13
-; SI-NEXT: v_readfirstlane_b32 s98, v40
+; SI-NEXT: v_readfirstlane_b32 s34, v54
; SI-NEXT: v_readfirstlane_b32 s69, v51
-; SI-NEXT: v_readfirstlane_b32 s21, v36
-; SI-NEXT: v_readfirstlane_b32 s40, v19
-; SI-NEXT: v_readfirstlane_b32 s23, v28
-; SI-NEXT: v_readfirstlane_b32 s34, v27
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v21, v13
-; SI-NEXT: v_mov_b32_e32 v13, v5
-; SI-NEXT: v_readfirstlane_b32 s97, v29
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40
+; SI-NEXT: v_readfirstlane_b32 s87, v40
+; SI-NEXT: v_readfirstlane_b32 s86, v49
+; SI-NEXT: v_writelane_b32 v62, s4, 14
+; SI-NEXT: v_writelane_b32 v62, s17, 15
+; SI-NEXT: v_writelane_b32 v62, s15, 16
+; SI-NEXT: v_writelane_b32 v62, s21, 17
+; SI-NEXT: v_writelane_b32 v62, s43, 18
+; SI-NEXT: v_writelane_b32 v62, s42, 19
+; SI-NEXT: v_writelane_b32 v62, s44, 20
+; SI-NEXT: v_readfirstlane_b32 s53, v12
+; SI-NEXT: v_readfirstlane_b32 s23, v11
+; SI-NEXT: v_readfirstlane_b32 s8, v20
+; SI-NEXT: v_readfirstlane_b32 s48, v19
+; SI-NEXT: v_readfirstlane_b32 s63, v28
+; SI-NEXT: v_readfirstlane_b32 s95, v27
+; SI-NEXT: v_mov_b32_e32 v29, v13
+; SI-NEXT: v_readfirstlane_b32 s97, v26
; SI-NEXT: v_readfirstlane_b32 s80, v18
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v22
-; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v30
; SI-NEXT: v_readfirstlane_b32 s96, v17
+; SI-NEXT: v_readfirstlane_b32 s65, v10
; SI-NEXT: v_readfirstlane_b32 s64, v9
-; SI-NEXT: v_readfirstlane_b32 s25, v8
+; SI-NEXT: v_readfirstlane_b32 s68, v8
; SI-NEXT: v_readfirstlane_b32 s83, v7
; SI-NEXT: v_readfirstlane_b32 s84, v4
; SI-NEXT: v_readfirstlane_b32 s93, v3
-; SI-NEXT: v_readfirstlane_b32 s76, v1
-; SI-NEXT: v_readfirstlane_b32 s58, v38
-; SI-NEXT: v_readfirstlane_b32 s65, v49
-; SI-NEXT: v_readfirstlane_b32 s62, v54
-; SI-NEXT: v_readfirstlane_b32 s81, v44
-; SI-NEXT: v_readfirstlane_b32 s71, v47
-; SI-NEXT: v_readfirstlane_b32 s38, v60
-; SI-NEXT: v_readfirstlane_b32 s86, v61
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:220
+; SI-NEXT: v_readfirstlane_b32 s90, v2
+; SI-NEXT: v_readfirstlane_b32 s11, v1
+; SI-NEXT: v_readfirstlane_b32 s59, v37
+; SI-NEXT: v_readfirstlane_b32 s94, v50
+; SI-NEXT: v_readfirstlane_b32 s39, v53
+; SI-NEXT: v_readfirstlane_b32 s81, v43
+; SI-NEXT: v_readfirstlane_b32 s71, v46
+; SI-NEXT: v_readfirstlane_b32 s85, v60
+; SI-NEXT: v_readfirstlane_b32 s89, v61
+; SI-NEXT: v_readfirstlane_b32 s49, v33
+; SI-NEXT: v_readfirstlane_b32 s70, v34
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s90, v50
-; SI-NEXT: v_readfirstlane_b32 s31, v52
-; SI-NEXT: v_readfirstlane_b32 s4, v55
+; SI-NEXT: v_readfirstlane_b32 s74, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220
+; SI-NEXT: v_readfirstlane_b32 s91, v52
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32
-; SI-NEXT: v_readfirstlane_b32 s72, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72
-; SI-NEXT: v_readfirstlane_b32 s82, v56
-; SI-NEXT: v_readfirstlane_b32 s95, v57
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32
+; SI-NEXT: v_readfirstlane_b32 s37, v56
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s39, v58
-; SI-NEXT: v_readfirstlane_b32 s56, v59
-; SI-NEXT: v_readfirstlane_b32 s57, v41
+; SI-NEXT: v_readfirstlane_b32 s38, v59
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72
+; SI-NEXT: v_readfirstlane_b32 s25, v57
+; SI-NEXT: v_readfirstlane_b32 s56, v58
+; SI-NEXT: v_readfirstlane_b32 s57, v55
+; SI-NEXT: v_readfirstlane_b32 s58, v41
; SI-NEXT: v_readfirstlane_b32 s36, v42
-; SI-NEXT: v_readfirstlane_b32 s73, v45
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:284
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:124
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92
-; SI-NEXT: v_readfirstlane_b32 s16, v34
-; SI-NEXT: v_readfirstlane_b32 s48, v32
-; SI-NEXT: v_readfirstlane_b32 s52, v33
-; SI-NEXT: v_writelane_b32 v62, s4, 14
+; SI-NEXT: v_readfirstlane_b32 s40, v45
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:284
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:252
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92
+; SI-NEXT: v_readfirstlane_b32 s75, v32
; SI-NEXT: v_readfirstlane_b32 s47, v35
-; SI-NEXT: v_readfirstlane_b32 s60, v37
-; SI-NEXT: v_readfirstlane_b32 s61, v39
-; SI-NEXT: v_readfirstlane_b32 s89, v43
+; SI-NEXT: v_writelane_b32 v62, s56, 21
+; SI-NEXT: v_writelane_b32 v62, s49, 22
+; SI-NEXT: v_readfirstlane_b32 s72, v38
+; SI-NEXT: v_readfirstlane_b32 s73, v39
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s99, v46
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:312
+; SI-NEXT: v_readfirstlane_b32 s22, v44
+; SI-NEXT: v_readfirstlane_b32 s99, v47
+; SI-NEXT: v_writelane_b32 v62, s53, 23
+; SI-NEXT: v_writelane_b32 v62, s70, 24
+; SI-NEXT: v_writelane_b32 v62, s23, 25
+; SI-NEXT: v_writelane_b32 v62, s57, 26
+; SI-NEXT: v_readfirstlane_b32 s54, v51
+; SI-NEXT: v_readfirstlane_b32 s50, v54
+; SI-NEXT: v_readfirstlane_b32 s31, v48
+; SI-NEXT: v_readfirstlane_b32 s78, v49
+; SI-NEXT: v_readfirstlane_b32 s30, v50
+; SI-NEXT: v_readfirstlane_b32 s24, v53
+; SI-NEXT: v_readfirstlane_b32 s28, v40
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_readfirstlane_b32 s20, v43
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_readfirstlane_b32 s45, v46
+; SI-NEXT: v_writelane_b32 v62, s45, 27
+; SI-NEXT: v_writelane_b32 v62, s8, 28
+; SI-NEXT: v_writelane_b32 v62, s58, 29
+; SI-NEXT: v_writelane_b32 v62, s59, 30
+; SI-NEXT: v_writelane_b32 v62, s47, 31
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_readfirstlane_b32 s60, v36
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:312
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:280
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:184
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24
-; SI-NEXT: v_readfirstlane_b32 s54, v48
-; SI-NEXT: v_readfirstlane_b32 s50, v53
-; SI-NEXT: v_readfirstlane_b32 s78, v49
-; SI-NEXT: v_readfirstlane_b32 s30, v51
-; SI-NEXT: v_readfirstlane_b32 s66, v54
-; SI-NEXT: v_readfirstlane_b32 s91, v40
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s6, v44
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s4, v10
-; SI-NEXT: v_writelane_b32 v62, s4, 15
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_writelane_b32 v62, s4, 16
-; SI-NEXT: v_writelane_b32 v62, s17, 17
-; SI-NEXT: v_writelane_b32 v62, s15, 18
-; SI-NEXT: v_writelane_b32 v62, s18, 19
-; SI-NEXT: v_writelane_b32 v62, s43, 20
-; SI-NEXT: v_writelane_b32 v62, s42, 21
-; SI-NEXT: v_writelane_b32 v62, s44, 22
-; SI-NEXT: v_writelane_b32 v62, s16, 23
-; SI-NEXT: v_writelane_b32 v62, s49, 24
-; SI-NEXT: v_writelane_b32 v62, s8, 25
-; SI-NEXT: v_writelane_b32 v62, s6, 26
-; SI-NEXT: v_readfirstlane_b32 s45, v52
-; SI-NEXT: v_writelane_b32 v62, s56, 27
-; SI-NEXT: v_writelane_b32 v62, s45, 28
-; SI-NEXT: v_writelane_b32 v62, s53, 29
-; SI-NEXT: v_writelane_b32 v62, s94, 30
-; SI-NEXT: v_writelane_b32 v62, s57, 31
-; SI-NEXT: v_writelane_b32 v62, s58, 32
-; SI-NEXT: v_writelane_b32 v62, s47, 33
-; SI-NEXT: v_readfirstlane_b32 s46, v55
-; SI-NEXT: v_writelane_b32 v62, s40, 34
-; SI-NEXT: v_readfirstlane_b32 s59, v47
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24
+; SI-NEXT: v_writelane_b32 v62, s48, 32
+; SI-NEXT: v_writelane_b32 v62, s26, 33
+; SI-NEXT: v_readfirstlane_b32 s46, v60
+; SI-NEXT: v_writelane_b32 v62, s60, 34
+; SI-NEXT: v_readfirstlane_b32 s61, v61
; SI-NEXT: v_writelane_b32 v62, s46, 35
-; SI-NEXT: v_writelane_b32 v62, s59, 36
-; SI-NEXT: v_writelane_b32 v62, s60, 37
+; SI-NEXT: v_writelane_b32 v62, s61, 36
+; SI-NEXT: v_writelane_b32 v62, s72, 37
; SI-NEXT: v_writelane_b32 v62, s36, 38
-; SI-NEXT: v_writelane_b32 v62, s65, 39
-; SI-NEXT: v_writelane_b32 v62, s61, 40
-; SI-NEXT: v_writelane_b32 v62, s73, 41
-; SI-NEXT: v_writelane_b32 v62, s62, 42
-; SI-NEXT: v_writelane_b32 v62, s72, 43
-; SI-NEXT: v_writelane_b32 v62, s23, 44
-; SI-NEXT: v_writelane_b32 v62, s48, 45
-; SI-NEXT: v_writelane_b32 v62, s34, 46
+; SI-NEXT: v_writelane_b32 v62, s94, 39
+; SI-NEXT: v_writelane_b32 v62, s73, 40
+; SI-NEXT: v_writelane_b32 v62, s40, 41
+; SI-NEXT: v_writelane_b32 v62, s39, 42
+; SI-NEXT: v_writelane_b32 v62, s74, 43
+; SI-NEXT: v_writelane_b32 v62, s63, 44
+; SI-NEXT: v_writelane_b32 v62, s75, 45
+; SI-NEXT: v_writelane_b32 v62, s95, 46
; SI-NEXT: v_writelane_b32 v62, s78, 47
; SI-NEXT: v_writelane_b32 v62, s30, 48
; SI-NEXT: v_writelane_b32 v62, s54, 49
; SI-NEXT: v_writelane_b32 v62, s50, 50
-; SI-NEXT: v_writelane_b32 v62, s52, 51
-; SI-NEXT: v_writelane_b32 v62, s82, 52
-; SI-NEXT: v_writelane_b32 v62, s66, 53
-; SI-NEXT: v_readfirstlane_b32 s22, v36
+; SI-NEXT: v_writelane_b32 v62, s25, 51
+; SI-NEXT: v_writelane_b32 v62, s24, 52
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v6
+; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v14
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v30
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v57
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v56
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v58
+; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v57
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v59
-; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v56
-; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v60
-; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45
-; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v61
-; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v42
-; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v41
-; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v31
-; SI-NEXT: v_writelane_b32 v62, s91, 54
+; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v58
+; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v45
+; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v59
+; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v42
+; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v31
+; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v41
+; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v55
+; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v5
+; SI-NEXT: v_writelane_b32 v62, s28, 53
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB97_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v5, v13
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: v_readlane_b32 s5, v62, 5
-; SI-NEXT: s_and_b32 s4, s20, 0xff
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_readlane_b32 s4, v62, 8
+; SI-NEXT: v_readlane_b32 s5, v62, 7
+; SI-NEXT: v_mov_b32_e32 v13, v21
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_writelane_b32 v62, s4, 55
-; SI-NEXT: v_readlane_b32 s4, v62, 4
+; SI-NEXT: v_writelane_b32 v62, s4, 54
+; SI-NEXT: v_readlane_b32 s4, v62, 6
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: v_readlane_b32 s5, v62, 3
+; SI-NEXT: v_readlane_b32 s5, v62, 5
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s5, 24
-; SI-NEXT: s_or_b32 s63, s5, s4
-; SI-NEXT: v_readlane_b32 s4, v62, 6
+; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: v_writelane_b32 v62, s4, 55
+; SI-NEXT: v_readlane_b32 s4, v62, 9
; SI-NEXT: s_and_b32 s5, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_lshl_b32 s9, s19, 24
@@ -209227,27 +209340,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_lshl_b32 s10, s29, 8
; SI-NEXT: s_or_b32 s4, s5, s10
; SI-NEXT: v_writelane_b32 v62, s4, 56
-; SI-NEXT: s_and_b32 s5, s76, 0xff
-; SI-NEXT: v_readlane_b32 s10, v62, 16
+; SI-NEXT: v_writelane_b32 v62, s37, 57
+; SI-NEXT: s_and_b32 s5, s11, 0xff
+; SI-NEXT: s_mov_b32 s37, s11
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s11, s10, 24
+; SI-NEXT: s_lshl_b32 s11, s90, 24
+; SI-NEXT: v_readlane_b32 s4, v62, 2
; SI-NEXT: s_or_b32 s5, s11, s5
-; SI-NEXT: s_and_b32 s11, s26, 0xff
+; SI-NEXT: s_and_b32 s11, s4, 0xff
+; SI-NEXT: v_readlane_b32 s4, v62, 1
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s12, s27, 24
+; SI-NEXT: s_lshl_b32 s12, s4, 24
; SI-NEXT: s_or_b32 s14, s12, s11
; SI-NEXT: s_and_b32 s11, s83, 0xff
-; SI-NEXT: s_lshl_b32 s12, s25, 8
-; SI-NEXT: s_or_b32 s10, s11, s12
-; SI-NEXT: v_writelane_b32 v62, s10, 57
+; SI-NEXT: s_lshl_b32 s12, s68, 8
+; SI-NEXT: s_or_b32 s4, s11, s12
; SI-NEXT: s_and_b32 s11, s64, 0xff
-; SI-NEXT: v_readlane_b32 s10, v62, 15
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s13, s10, 24
+; SI-NEXT: s_lshl_b32 s13, s65, 24
; SI-NEXT: s_or_b32 s41, s13, s11
; SI-NEXT: s_and_b32 s11, s43, 0xff
; SI-NEXT: s_lshl_b32 s13, s15, 8
-; SI-NEXT: s_or_b32 s10, s11, s13
+; SI-NEXT: v_writelane_b32 v62, s4, 58
+; SI-NEXT: s_or_b32 s4, s11, s13
; SI-NEXT: s_and_b32 s11, s96, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
; SI-NEXT: s_lshl_b32 s15, s80, 24
@@ -209255,266 +209370,269 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s11, s44, 0xff
; SI-NEXT: s_lshl_b32 s15, s42, 8
; SI-NEXT: s_or_b32 s13, s11, s15
-; SI-NEXT: s_and_b32 s11, s18, 0xff
+; SI-NEXT: s_and_b32 s11, s21, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
; SI-NEXT: s_lshl_b32 s15, s97, 24
; SI-NEXT: s_or_b32 s44, s15, s11
-; SI-NEXT: s_and_b32 s11, s59, 0xff
+; SI-NEXT: s_and_b32 s11, s61, 0xff
; SI-NEXT: s_lshl_b32 s15, s46, 8
; SI-NEXT: s_or_b32 s12, s11, s15
; SI-NEXT: s_and_b32 s11, s45, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s15, s6, 24
+; SI-NEXT: s_lshl_b32 s15, s20, 24
; SI-NEXT: s_or_b32 s45, s15, s11
; SI-NEXT: s_and_b32 s11, s30, 0xff
; SI-NEXT: s_lshl_b32 s15, s78, 8
-; SI-NEXT: v_writelane_b32 v62, s10, 58
; SI-NEXT: s_or_b32 s10, s11, s15
; SI-NEXT: s_and_b32 s11, s99, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s15, s89, 24
+; SI-NEXT: s_lshl_b32 s15, s22, 24
; SI-NEXT: s_or_b32 s46, s15, s11
-; SI-NEXT: s_and_b32 s11, s61, 0xff
-; SI-NEXT: s_lshl_b32 s15, s60, 8
-; SI-NEXT: s_or_b32 s6, s11, s15
-; SI-NEXT: s_and_b32 s11, s22, 0xff
+; SI-NEXT: s_and_b32 s11, s73, 0xff
+; SI-NEXT: s_lshl_b32 s15, s72, 8
+; SI-NEXT: v_writelane_b32 v62, s4, 59
+; SI-NEXT: s_or_b32 s4, s11, s15
+; SI-NEXT: s_and_b32 s11, s60, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
; SI-NEXT: s_lshl_b32 s15, s47, 24
; SI-NEXT: s_or_b32 s47, s15, s11
-; SI-NEXT: s_and_b32 s11, s57, 0xff
-; SI-NEXT: s_lshl_b32 s15, s56, 8
-; SI-NEXT: v_writelane_b32 v62, s6, 59
-; SI-NEXT: s_or_b32 s6, s11, s15
-; SI-NEXT: s_and_b32 s11, s39, 0xff
-; SI-NEXT: v_writelane_b32 v62, s6, 60
+; SI-NEXT: s_and_b32 s11, s58, 0xff
+; SI-NEXT: s_lshl_b32 s15, s57, 8
+; SI-NEXT: s_mov_b32 s62, s16
+; SI-NEXT: s_or_b32 s16, s11, s15
+; SI-NEXT: s_and_b32 s11, s38, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s15, s95, 24
+; SI-NEXT: s_lshl_b32 s15, s56, 24
; SI-NEXT: s_or_b32 s56, s15, s11
-; SI-NEXT: s_and_b32 s11, s48, 0xff
-; SI-NEXT: s_lshl_b32 s15, s72, 8
-; SI-NEXT: v_readlane_b32 s6, v62, 14
-; SI-NEXT: s_or_b32 s48, s11, s15
-; SI-NEXT: s_and_b32 s11, s6, 0xff
+; SI-NEXT: s_and_b32 s11, s75, 0xff
+; SI-NEXT: s_lshl_b32 s15, s74, 8
+; SI-NEXT: s_or_b32 s73, s11, s15
+; SI-NEXT: s_and_b32 s11, s91, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
; SI-NEXT: s_lshl_b32 s15, s31, 24
; SI-NEXT: s_or_b32 vcc_lo, s15, s11
-; SI-NEXT: s_and_b32 s11, s86, 0xff
-; SI-NEXT: s_lshl_b32 s15, s38, 8
+; SI-NEXT: s_and_b32 s11, s89, 0xff
+; SI-NEXT: s_lshl_b32 s15, s85, 8
; SI-NEXT: s_or_b32 s72, s11, s15
; SI-NEXT: s_and_b32 s11, s71, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
; SI-NEXT: s_lshl_b32 s15, s81, 24
; SI-NEXT: s_or_b32 vcc_hi, s15, s11
-; SI-NEXT: s_and_b32 s11, s58, 0xff
-; SI-NEXT: s_lshl_b32 s15, s85, 8
+; SI-NEXT: s_and_b32 s11, s59, 0xff
+; SI-NEXT: s_lshl_b32 s15, s86, 8
; SI-NEXT: s_or_b32 s57, s11, s15
; SI-NEXT: s_and_b32 s11, s69, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s15, s74, 24
-; SI-NEXT: v_writelane_b32 v62, s74, 61
+; SI-NEXT: s_lshl_b32 s15, s34, 24
; SI-NEXT: s_or_b32 s74, s15, s11
-; SI-NEXT: s_and_b32 s11, s87, 0xff
-; SI-NEXT: s_lshl_b32 s15, s21, 8
+; SI-NEXT: s_and_b32 s11, s18, 0xff
+; SI-NEXT: s_lshl_b32 s15, s98, 8
; SI-NEXT: s_or_b32 s58, s11, s15
-; SI-NEXT: s_and_b32 s11, s68, 0xff
+; SI-NEXT: s_and_b32 s11, s6, 0xff
+; SI-NEXT: v_writelane_b32 v62, s4, 60
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s15, s28, 24
+; SI-NEXT: s_lshl_b32 s15, s27, 24
+; SI-NEXT: v_writelane_b32 v62, s34, 61
; SI-NEXT: s_or_b32 s75, s15, s11
-; SI-NEXT: s_and_b32 s11, s24, 0xff
+; SI-NEXT: s_and_b32 s11, s66, 0xff
; SI-NEXT: s_lshl_b32 s15, s55, 8
-; SI-NEXT: v_writelane_b32 v62, s25, 62
+; SI-NEXT: v_writelane_b32 v62, s6, 62
; SI-NEXT: s_or_b32 s59, s11, s15
-; SI-NEXT: s_and_b32 s11, s37, 0xff
+; SI-NEXT: s_and_b32 s11, s76, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 16
; SI-NEXT: s_lshl_b32 s15, s51, 24
-; SI-NEXT: v_readlane_b32 s4, v62, 13
-; SI-NEXT: s_mov_b32 s18, s21
-; SI-NEXT: s_mov_b32 s21, s97
-; SI-NEXT: s_mov_b32 s97, s37
-; SI-NEXT: s_mov_b32 s37, s76
+; SI-NEXT: v_writelane_b32 v62, s85, 63
+; SI-NEXT: s_mov_b32 s4, s97
+; SI-NEXT: s_mov_b32 s97, s76
; SI-NEXT: s_or_b32 s76, s15, s11
+; SI-NEXT: v_readlane_b32 s15, v62, 14
; SI-NEXT: s_and_b32 s11, s35, 0xff
-; SI-NEXT: s_lshl_b32 s15, s4, 8
+; SI-NEXT: s_lshl_b32 s15, s15, 8
; SI-NEXT: s_or_b32 s60, s11, s15
; SI-NEXT: s_and_b32 s11, s77, 0xff
-; SI-NEXT: v_readlane_b32 s4, v62, 12
+; SI-NEXT: v_readlane_b32 s15, v62, 13
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s15, s4, 24
-; SI-NEXT: v_readlane_b32 s4, v62, 11
-; SI-NEXT: s_mov_b32 s6, s95
-; SI-NEXT: s_mov_b32 s95, s39
-; SI-NEXT: s_mov_b32 s39, s89
-; SI-NEXT: s_mov_b32 s89, s99
+; SI-NEXT: s_lshl_b32 s15, s15, 24
+; SI-NEXT: s_mov_b32 s21, s20
+; SI-NEXT: s_mov_b32 s20, s38
+; SI-NEXT: s_mov_b32 s38, s99
; SI-NEXT: s_mov_b32 s99, s83
; SI-NEXT: s_mov_b32 s83, s55
; SI-NEXT: s_mov_b32 s55, s64
; SI-NEXT: s_mov_b32 s64, s35
; SI-NEXT: s_mov_b32 s35, s77
; SI-NEXT: s_or_b32 s77, s15, s11
-; SI-NEXT: s_and_b32 s11, s4, 0xff
-; SI-NEXT: v_readlane_b32 s4, v62, 10
-; SI-NEXT: s_lshl_b32 s15, s4, 8
-; SI-NEXT: v_readlane_b32 s4, v62, 9
+; SI-NEXT: s_and_b32 s11, s82, 0xff
+; SI-NEXT: s_lshl_b32 s15, s52, 8
; SI-NEXT: s_or_b32 s61, s11, s15
-; SI-NEXT: s_and_b32 s11, s4, 0xff
-; SI-NEXT: v_readlane_b32 s4, v62, 8
+; SI-NEXT: v_readlane_b32 s11, v62, 12
+; SI-NEXT: s_and_b32 s11, s11, 0xff
+; SI-NEXT: v_readlane_b32 s15, v62, 11
; SI-NEXT: s_lshl_b32 s11, s11, 16
-; SI-NEXT: s_lshl_b32 s15, s4, 24
+; SI-NEXT: s_lshl_b32 s15, s15, 24
; SI-NEXT: s_or_b32 s78, s15, s11
-; SI-NEXT: v_readlane_b32 s11, v62, 7
+; SI-NEXT: v_readlane_b32 s11, v62, 10
; SI-NEXT: s_and_b32 s11, s11, 0xff
; SI-NEXT: s_lshl_b32 s15, s17, 8
; SI-NEXT: s_or_b32 s11, s11, s15
; SI-NEXT: s_and_b32 s11, s11, 0xffff
+; SI-NEXT: s_mov_b32 s30, s18
+; SI-NEXT: s_mov_b32 s18, s89
+; SI-NEXT: s_mov_b32 s89, s98
+; SI-NEXT: s_mov_b32 s98, s96
+; SI-NEXT: s_mov_b32 s96, s66
+; SI-NEXT: s_mov_b32 s66, s82
+; SI-NEXT: s_mov_b32 s82, s52
; SI-NEXT: v_mov_b32_e32 v51, s9
-; SI-NEXT: s_or_b32 s17, s11, s9
-; SI-NEXT: v_readlane_b32 s9, v62, 2
-; SI-NEXT: v_readlane_b32 s11, v62, 1
+; SI-NEXT: s_or_b32 s52, s11, s9
+; SI-NEXT: v_readlane_b32 s9, v62, 4
+; SI-NEXT: v_readlane_b32 s11, v62, 3
; SI-NEXT: s_and_b32 s9, s9, 0xff
; SI-NEXT: s_lshl_b32 s15, s11, 8
; SI-NEXT: s_or_b32 s9, s9, s15
; SI-NEXT: s_and_b32 s9, s9, 0xffff
-; SI-NEXT: s_mov_b32 s4, s96
-; SI-NEXT: s_mov_b32 s96, s24
; SI-NEXT: v_mov_b32_e32 v52, s14
-; SI-NEXT: s_or_b32 s24, s9, s14
+; SI-NEXT: s_or_b32 s17, s9, s14
; SI-NEXT: s_and_b32 s14, s93, 0xff
; SI-NEXT: s_lshl_b32 s15, s84, 8
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v53, v6, v1
+; SI-NEXT: v_or_b32_e32 v53, v60, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: v_or_b32_e32 v50, s14, v53
-; SI-NEXT: s_and_b32 s14, s8, 0xff
-; SI-NEXT: s_lshl_b32 s15, s49, 8
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v21
+; SI-NEXT: s_and_b32 s14, s23, 0xff
+; SI-NEXT: s_lshl_b32 s15, s53, 8
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v29
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v54, v14, v1
+; SI-NEXT: v_or_b32_e32 v54, v61, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: v_or_b32_e32 v17, s14, v54
-; SI-NEXT: s_and_b32 s14, s40, 0xff
-; SI-NEXT: s_lshl_b32 s15, s53, 8
+; SI-NEXT: s_and_b32 s14, s48, 0xff
+; SI-NEXT: s_lshl_b32 s15, s8, 8
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v29
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v19
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v55, v18, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: v_or_b32_e32 v16, s14, v55
-; SI-NEXT: s_and_b32 s14, s34, 0xff
-; SI-NEXT: s_lshl_b32 s15, s23, 8
+; SI-NEXT: s_and_b32 s14, s95, 0xff
+; SI-NEXT: s_lshl_b32 s15, s63, 8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v21
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v40, v19, v1
+; SI-NEXT: v_or_b32_e32 v40, v6, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: v_or_b32_e32 v15, s14, v40
-; SI-NEXT: s_and_b32 s14, s91, 0xff
-; SI-NEXT: s_lshl_b32 s15, s66, 8
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v43
+; SI-NEXT: s_and_b32 s14, s28, 0xff
+; SI-NEXT: s_lshl_b32 s15, s24, 8
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v44
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v41, v22, v1
+; SI-NEXT: v_or_b32_e32 v41, v20, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: v_or_b32_e32 v12, s14, v41
; SI-NEXT: s_and_b32 s14, s50, 0xff
; SI-NEXT: s_lshl_b32 s15, s54, 8
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v32
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v47
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v42, v23, v1
+; SI-NEXT: v_or_b32_e32 v42, v22, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: v_or_b32_e32 v11, s14, v42
-; SI-NEXT: s_and_b32 s14, s73, 0xff
+; SI-NEXT: s_and_b32 s14, s40, 0xff
; SI-NEXT: s_lshl_b32 s15, s36, 8
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v46
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v34
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v28, v59, v1
+; SI-NEXT: v_or_b32_e32 v59, v23, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
-; SI-NEXT: v_or_b32_e32 v10, s14, v28
-; SI-NEXT: s_and_b32 s14, s82, 0xff
-; SI-NEXT: s_lshl_b32 s15, s52, 8
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v26
+; SI-NEXT: v_readlane_b32 s8, v62, 57
+; SI-NEXT: v_or_b32_e32 v10, s14, v59
+; SI-NEXT: s_and_b32 s14, s25, 0xff
+; SI-NEXT: s_lshl_b32 s15, s8, 8
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v35
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v60, v24, v1
+; SI-NEXT: v_mov_b32_e32 v25, v20
+; SI-NEXT: v_mov_b32_e32 v20, v60
+; SI-NEXT: v_or_b32_e32 v60, v14, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: v_or_b32_e32 v9, s14, v60
-; SI-NEXT: s_and_b32 s14, s90, 0xff
-; SI-NEXT: s_lshl_b32 s15, s16, 8
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v35
+; SI-NEXT: s_and_b32 s14, s70, 0xff
+; SI-NEXT: s_lshl_b32 s15, s49, 8
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v36
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v31, v44, v1
+; SI-NEXT: v_or_b32_e32 v5, v43, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
-; SI-NEXT: v_or_b32_e32 v8, s14, v31
-; SI-NEXT: s_and_b32 s14, s62, 0xff
-; SI-NEXT: s_lshl_b32 s15, s65, 8
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v37
+; SI-NEXT: v_or_b32_e32 v8, s14, v5
+; SI-NEXT: s_and_b32 s14, s39, 0xff
+; SI-NEXT: s_lshl_b32 s15, s94, 8
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v32
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v61, v45, v1
+; SI-NEXT: v_or_b32_e32 v31, v45, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
-; SI-NEXT: v_or_b32_e32 v7, s14, v61
-; SI-NEXT: s_and_b32 s14, s98, 0xff
+; SI-NEXT: v_or_b32_e32 v7, s14, v31
+; SI-NEXT: s_and_b32 s14, s87, 0xff
; SI-NEXT: s_lshl_b32 s15, s67, 8
; SI-NEXT: v_and_b32_e32 v1, 0xff, v38
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_or_b32_e32 v6, v47, v1
+; SI-NEXT: v_mov_b32_e32 v26, v34
+; SI-NEXT: v_mov_b32_e32 v34, v22
+; SI-NEXT: v_mov_b32_e32 v22, v61
+; SI-NEXT: v_or_b32_e32 v61, v46, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
-; SI-NEXT: v_or_b32_e32 v4, s14, v6
+; SI-NEXT: v_or_b32_e32 v4, s14, v61
; SI-NEXT: s_and_b32 s14, s92, 0xff
; SI-NEXT: s_lshl_b32 s15, s7, 8
; SI-NEXT: v_and_b32_e32 v1, 0xff, v33
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_readlane_b32 s8, v62, 55
+; SI-NEXT: v_readlane_b32 s8, v62, 54
; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_mov_b32_e32 v22, v14
-; SI-NEXT: v_or_b32_e32 v14, v56, v1
+; SI-NEXT: v_or_b32_e32 v6, v56, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: s_and_b32 s8, s8, 0xffff
-; SI-NEXT: v_or_b32_e32 v2, s14, v14
-; SI-NEXT: s_and_b32 s14, s70, 0xff
-; SI-NEXT: s_lshl_b32 s15, s94, 8
+; SI-NEXT: v_readlane_b32 s11, v62, 55
+; SI-NEXT: v_mov_b32_e32 v28, v36
+; SI-NEXT: v_or_b32_e32 v36, s14, v6
+; SI-NEXT: s_and_b32 s14, s26, 0xff
+; SI-NEXT: s_lshl_b32 s15, s62, 8
; SI-NEXT: v_and_b32_e32 v1, 0xff, v39
-; SI-NEXT: s_or_b32 s42, s8, s63
+; SI-NEXT: s_or_b32 s42, s8, s11
; SI-NEXT: v_readlane_b32 s8, v62, 56
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_and_b32 s8, s8, 0xffff
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v32, v23
-; SI-NEXT: v_mov_b32_e32 v23, v18
-; SI-NEXT: v_or_b32_e32 v18, v57, v1
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_or_b32_e32 v14, v57, v1
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: s_or_b32 s40, s8, s5
-; SI-NEXT: v_readlane_b32 s8, v62, 57
-; SI-NEXT: v_or_b32_e32 v1, s14, v18
+; SI-NEXT: v_readlane_b32 s8, v62, 58
+; SI-NEXT: v_or_b32_e32 v1, s14, v14
; SI-NEXT: s_and_b32 s14, s88, 0xff
; SI-NEXT: s_lshl_b32 s15, s79, 8
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v34
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v37
; SI-NEXT: s_and_b32 s8, s8, 0xffff
-; SI-NEXT: v_readlane_b32 s9, v62, 60
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_or_b32 s15, s8, s41
-; SI-NEXT: v_readlane_b32 s8, v62, 58
-; SI-NEXT: s_and_b32 s16, s9, 0xffff
-; SI-NEXT: v_mov_b32_e32 v27, v26
-; SI-NEXT: v_mov_b32_e32 v26, v24
-; SI-NEXT: v_mov_b32_e32 v24, v19
-; SI-NEXT: v_or_b32_e32 v19, v58, v3
+; SI-NEXT: v_readlane_b32 s8, v62, 59
+; SI-NEXT: s_and_b32 s16, s16, 0xffff
+; SI-NEXT: v_mov_b32_e32 v27, v35
+; SI-NEXT: v_mov_b32_e32 v35, v23
+; SI-NEXT: v_mov_b32_e32 v23, v18
+; SI-NEXT: v_or_b32_e32 v18, v58, v3
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: s_and_b32 s8, s8, 0xffff
; SI-NEXT: s_or_b32 s36, s16, s56
-; SI-NEXT: s_and_b32 s16, s48, 0xffff
-; SI-NEXT: v_or_b32_e32 v3, s14, v19
+; SI-NEXT: s_and_b32 s16, s73, 0xffff
+; SI-NEXT: v_or_b32_e32 v3, s14, v18
; SI-NEXT: s_or_b32 s14, s8, s43
; SI-NEXT: s_and_b32 s8, s13, 0xffff
; SI-NEXT: s_or_b32 s53, s16, vcc_lo
@@ -209528,49 +209646,46 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s49, s16, s74
; SI-NEXT: s_and_b32 s16, s58, 0xffff
; SI-NEXT: s_or_b32 s10, s8, s46
-; SI-NEXT: v_readlane_b32 s8, v62, 59
+; SI-NEXT: v_readlane_b32 s8, v62, 60
; SI-NEXT: s_or_b32 s48, s16, s75
; SI-NEXT: s_and_b32 s16, s59, 0xffff
; SI-NEXT: s_and_b32 s8, s8, 0xffff
-; SI-NEXT: s_or_b32 s11, s16, s76
+; SI-NEXT: s_or_b32 s39, s16, s76
; SI-NEXT: s_and_b32 s16, s60, 0xffff
; SI-NEXT: s_and_b32 s23, s61, 0xffff
-; SI-NEXT: s_mov_b32 s30, s87
-; SI-NEXT: s_mov_b32 s87, s85
; SI-NEXT: s_or_b32 s8, s8, s47
-; SI-NEXT: s_or_b32 s9, s16, s77
-; SI-NEXT: s_or_b32 s16, s23, s78
-; SI-NEXT: v_mov_b32_e32 v36, v35
-; SI-NEXT: v_mov_b32_e32 v30, v37
-; SI-NEXT: v_mov_b32_e32 v35, v45
-; SI-NEXT: v_mov_b32_e32 v20, v47
-; SI-NEXT: v_mov_b32_e32 v49, v56
-; SI-NEXT: v_mov_b32_e32 v48, v39
-; SI-NEXT: v_mov_b32_e32 v39, v57
-; SI-NEXT: v_mov_b32_e32 v25, v58
+; SI-NEXT: s_or_b32 s70, s16, s77
+; SI-NEXT: s_or_b32 s9, s23, s78
+; SI-NEXT: v_mov_b32_e32 v24, v45
+; SI-NEXT: v_mov_b32_e32 v48, v46
+; SI-NEXT: v_mov_b32_e32 v30, v56
+; SI-NEXT: v_mov_b32_e32 v49, v57
+; SI-NEXT: v_mov_b32_e32 v2, v58
; SI-NEXT: v_alignbit_b32 v57, s42, v51, 16
; SI-NEXT: v_alignbit_b32 v58, s40, v52, 16
; SI-NEXT: v_alignbit_b32 v56, s15, v53, 16
; SI-NEXT: v_alignbit_b32 v47, s14, v54, 16
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_alignbit_b32 v46, s13, v55, 16
; SI-NEXT: v_alignbit_b32 v45, s12, v40, 16
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v44, s10, v41, 16
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v43, s8, v42, 16
-; SI-NEXT: v_alignbit_b32 v42, s36, v28, 16
+; SI-NEXT: v_alignbit_b32 v42, s36, v59, 16
; SI-NEXT: v_alignbit_b32 v41, s53, v60, 16
-; SI-NEXT: v_alignbit_b32 v40, s94, v31, 16
-; SI-NEXT: v_alignbit_b32 v55, s49, v61, 16
-; SI-NEXT: v_alignbit_b32 v54, s48, v6, 16
-; SI-NEXT: v_alignbit_b32 v53, s11, v14, 16
-; SI-NEXT: v_mov_b32_e32 v14, v22
-; SI-NEXT: v_alignbit_b32 v52, s9, v18, 16
+; SI-NEXT: v_mov_b32_e32 v60, v20
+; SI-NEXT: v_mov_b32_e32 v20, v25
+; SI-NEXT: v_alignbit_b32 v40, s94, v5, 16
+; SI-NEXT: v_alignbit_b32 v55, s49, v31, 16
+; SI-NEXT: v_alignbit_b32 v54, s48, v61, 16
+; SI-NEXT: v_mov_b32_e32 v61, v22
+; SI-NEXT: v_mov_b32_e32 v22, v34
+; SI-NEXT: v_alignbit_b32 v53, s39, v6, 16
+; SI-NEXT: s_mov_b32 s16, s62
+; SI-NEXT: v_alignbit_b32 v52, s70, v14, 16
+; SI-NEXT: v_alignbit_b32 v51, s9, v18, 16
; SI-NEXT: v_mov_b32_e32 v18, v23
-; SI-NEXT: v_alignbit_b32 v51, s16, v19, 16
-; SI-NEXT: v_mov_b32_e32 v19, v24
-; SI-NEXT: v_mov_b32_e32 v24, v26
-; SI-NEXT: s_lshr_b32 s73, s63, 16
+; SI-NEXT: v_mov_b32_e32 v23, v35
+; SI-NEXT: s_lshr_b32 s73, s11, 16
; SI-NEXT: s_lshr_b32 s72, s5, 16
; SI-NEXT: s_lshr_b32 s63, s41, 16
; SI-NEXT: s_lshr_b32 s62, s43, 16
@@ -209582,87 +209697,85 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_lshr_b32 s56, vcc_lo, 16
; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16
; SI-NEXT: s_lshr_b32 s46, s74, 16
-; SI-NEXT: v_readlane_b32 s25, v62, 62
-; SI-NEXT: v_readlane_b32 s74, v62, 61
+; SI-NEXT: v_readlane_b32 s34, v62, 61
; SI-NEXT: s_lshr_b32 s45, s75, 16
+; SI-NEXT: v_readlane_b32 s6, v62, 62
; SI-NEXT: s_lshr_b32 s44, s76, 16
-; SI-NEXT: s_mov_b32 s76, s37
-; SI-NEXT: s_mov_b32 s37, s97
-; SI-NEXT: s_mov_b32 s97, s21
-; SI-NEXT: s_mov_b32 s21, s18
-; SI-NEXT: s_mov_b32 s18, s17
-; SI-NEXT: s_mov_b32 s85, s87
-; SI-NEXT: s_mov_b32 s87, s30
-; SI-NEXT: s_mov_b32 s17, s24
+; SI-NEXT: s_mov_b32 s11, s37
+; SI-NEXT: v_readlane_b32 s37, v62, 57
+; SI-NEXT: s_mov_b32 s76, s97
+; SI-NEXT: s_mov_b32 s97, s4
+; SI-NEXT: v_readlane_b32 s85, v62, 63
; SI-NEXT: s_lshr_b32 s43, s77, 16
; SI-NEXT: s_mov_b32 s77, s35
; SI-NEXT: s_mov_b32 s35, s64
; SI-NEXT: s_mov_b32 s64, s55
; SI-NEXT: s_mov_b32 s55, s83
; SI-NEXT: s_mov_b32 s83, s99
-; SI-NEXT: s_mov_b32 s99, s89
-; SI-NEXT: s_mov_b32 s89, s39
-; SI-NEXT: s_mov_b32 s39, s95
-; SI-NEXT: s_mov_b32 s95, s6
+; SI-NEXT: s_mov_b32 s99, s38
+; SI-NEXT: s_mov_b32 s38, s20
+; SI-NEXT: s_mov_b32 s20, s21
+; SI-NEXT: s_mov_b32 s21, s52
; SI-NEXT: s_lshr_b32 s41, s78, 16
-; SI-NEXT: s_mov_b32 s24, s96
-; SI-NEXT: s_mov_b32 s96, s4
+; SI-NEXT: s_mov_b32 s52, s82
+; SI-NEXT: s_mov_b32 s82, s66
+; SI-NEXT: s_mov_b32 s66, s96
+; SI-NEXT: s_mov_b32 s96, s98
+; SI-NEXT: s_mov_b32 s98, s89
+; SI-NEXT: s_mov_b32 s89, s18
+; SI-NEXT: s_mov_b32 s18, s30
+; SI-NEXT: v_mov_b32_e32 v6, v19
+; SI-NEXT: v_mov_b32_e32 v14, v21
; SI-NEXT: s_cbranch_execnz .LBB97_3
; SI-NEXT: .LBB97_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT: v_mov_b32_e32 v6, v5
-; SI-NEXT: v_mov_b32_e32 v5, v27
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_add_i32 s4, s88, 3
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s79, 8
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34
+; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_or_b32_e32 v1, v25, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v1, s4, v1
-; SI-NEXT: v_readlane_b32 s4, v62, 11
-; SI-NEXT: s_add_i32 s4, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 10
-; SI-NEXT: v_readlane_b32 s6, v62, 9
+; SI-NEXT: s_add_i32 s4, s82, 3
+; SI-NEXT: v_readlane_b32 s8, v62, 12
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_add_i32 s8, s6, 3
+; SI-NEXT: s_lshl_b32 s5, s52, 8
+; SI-NEXT: s_add_i32 s8, s8, 3
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_readlane_b32 s5, v62, 8
+; SI-NEXT: v_readlane_b32 s5, v62, 11
; SI-NEXT: s_and_b32 s8, s8, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_lshl_b32 s8, s8, 16
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_add_i32 s70, s70, 3
-; SI-NEXT: v_readlane_b32 s6, v62, 30
; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: v_readlane_b32 s5, v62, 33
+; SI-NEXT: s_add_i32 s70, s5, 3
; SI-NEXT: s_and_b32 s5, s70, 0xff
-; SI-NEXT: s_lshl_b32 s8, s6, 8
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48
+; SI-NEXT: s_lshl_b32 s8, s16, 8
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v39
; SI-NEXT: s_or_b32 s5, s8, s5
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: s_addk_i32 s5, 0x300
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_and_b32 s5, s5, 0xffff
-; SI-NEXT: v_or_b32_e32 v2, v39, v2
+; SI-NEXT: v_or_b32_e32 v2, v49, v2
; SI-NEXT: v_or_b32_e32 v2, s5, v2
; SI-NEXT: s_add_i32 s5, s35, 3
-; SI-NEXT: v_readlane_b32 s6, v62, 13
+; SI-NEXT: v_readlane_b32 s8, v62, 14
; SI-NEXT: s_and_b32 s5, s5, 0xff
-; SI-NEXT: s_lshl_b32 s8, s6, 8
+; SI-NEXT: s_lshl_b32 s8, s8, 8
; SI-NEXT: s_add_i32 s9, s77, 3
; SI-NEXT: s_or_b32 s5, s8, s5
-; SI-NEXT: v_readlane_b32 s6, v62, 12
+; SI-NEXT: v_readlane_b32 s8, v62, 13
; SI-NEXT: s_and_b32 s9, s9, 0xff
-; SI-NEXT: s_lshl_b32 s8, s6, 24
+; SI-NEXT: s_lshl_b32 s8, s8, 24
; SI-NEXT: s_lshl_b32 s9, s9, 16
; SI-NEXT: s_addk_i32 s5, 0x300
; SI-NEXT: s_or_b32 s8, s8, s9
@@ -209670,9 +209783,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s5, s8, s5
; SI-NEXT: s_add_i32 s79, s92, 3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1
-; SI-NEXT: s_add_i32 s16, s4, 0x3000000
+; SI-NEXT: s_add_i32 s9, s4, 0x3000000
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2
-; SI-NEXT: s_add_i32 s9, s5, 0x3000000
+; SI-NEXT: s_add_i32 s70, s5, 0x3000000
; SI-NEXT: s_and_b32 s4, s79, 0xff
; SI-NEXT: s_lshl_b32 s5, s7, 8
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33
@@ -209681,16 +209794,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_or_b32_e32 v2, v49, v2
+; SI-NEXT: v_or_b32_e32 v2, v30, v2
; SI-NEXT: v_or_b32_e32 v2, s4, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: s_add_i32 s4, s24, 3
+; SI-NEXT: s_add_i32 s4, s66, 3
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s55, 8
-; SI-NEXT: s_add_i32 s8, s37, 3
+; SI-NEXT: s_add_i32 s8, s76, 3
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s8, s8, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
@@ -209699,8 +209808,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: s_add_i32 s52, s98, 3
-; SI-NEXT: s_add_i32 s11, s4, 0x3000000
+; SI-NEXT: s_add_i32 s52, s87, 3
+; SI-NEXT: s_add_i32 s39, s4, 0x3000000
; SI-NEXT: s_and_b32 s4, s52, 0xff
; SI-NEXT: s_lshl_b32 s5, s67, 8
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38
@@ -209709,64 +209818,68 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_or_b32_e32 v4, v20, v4
-; SI-NEXT: s_add_i32 s30, s87, 3
+; SI-NEXT: v_or_b32_e32 v4, v48, v4
+; SI-NEXT: s_add_i32 s30, s18, 3
; SI-NEXT: v_or_b32_e32 v4, s4, v4
; SI-NEXT: s_and_b32 s4, s30, 0xff
-; SI-NEXT: s_lshl_b32 s5, s21, 8
-; SI-NEXT: s_add_i32 s8, s68, 3
+; SI-NEXT: s_lshl_b32 s5, s98, 8
+; SI-NEXT: s_add_i32 s8, s6, 3
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s8, s8, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s28, 24
+; SI-NEXT: s_lshl_b32 s5, s27, 24
; SI-NEXT: s_lshl_b32 s8, s8, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s48, s4, 0x3000000
; SI-NEXT: v_readlane_b32 s4, v62, 42
-; SI-NEXT: v_mov_b32_e32 v22, v30
; SI-NEXT: s_add_i32 s87, s4, 3
; SI-NEXT: v_readlane_b32 s5, v62, 39
; SI-NEXT: s_and_b32 s4, s87, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_or_b32_e32 v7, v35, v7
-; SI-NEXT: v_or_b32_e32 v7, s4, v7
-; SI-NEXT: v_readlane_b32 s4, v62, 32
+; SI-NEXT: v_or_b32_e32 v5, v24, v5
+; SI-NEXT: v_or_b32_e32 v5, s4, v5
+; SI-NEXT: v_readlane_b32 s4, v62, 30
; SI-NEXT: s_add_i32 s67, s4, 3
; SI-NEXT: s_and_b32 s4, s67, 0xff
-; SI-NEXT: s_lshl_b32 s5, s85, 8
+; SI-NEXT: s_lshl_b32 s5, s86, 8
; SI-NEXT: s_add_i32 s8, s69, 3
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s8, s8, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s74, 24
+; SI-NEXT: s_lshl_b32 s5, s34, 24
; SI-NEXT: s_lshl_b32 s8, s8, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: s_add_i32 s50, s90, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 23
; SI-NEXT: s_add_i32 s49, s4, 0x3000000
+; SI-NEXT: v_readlane_b32 s4, v62, 24
+; SI-NEXT: v_mov_b32_e32 v25, v28
+; SI-NEXT: s_add_i32 s50, s4, 3
+; SI-NEXT: v_readlane_b32 s5, v62, 22
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v5
; SI-NEXT: s_and_b32 s4, s50, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v25
; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: s_addk_i32 s4, 0x300
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_add_i32 s94, s86, 3
-; SI-NEXT: v_or_b32_e32 v8, s4, v8
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v5, v8, v5
+; SI-NEXT: s_add_i32 s94, s89, 3
+; SI-NEXT: v_or_b32_e32 v5, s4, v5
; SI-NEXT: s_and_b32 s4, s94, 0xff
-; SI-NEXT: s_lshl_b32 s5, s38, 8
+; SI-NEXT: s_lshl_b32 s5, s85, 8
; SI-NEXT: s_add_i32 s8, s71, 3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s8, s8, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
@@ -209776,25 +209889,27 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s94, s4, 0x3000000
-; SI-NEXT: v_readlane_b32 s4, v62, 52
+; SI-NEXT: v_readlane_b32 s4, v62, 51
+; SI-NEXT: v_mov_b32_e32 v21, v27
; SI-NEXT: s_add_i32 s18, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 51
+; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v5
; SI-NEXT: s_and_b32 s4, s18, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 8
+; SI-NEXT: s_lshl_b32 s5, s37, 8
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_or_b32_e32 v9, v24, v9
-; SI-NEXT: v_or_b32_e32 v9, s4, v9
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v5, v9, v5
+; SI-NEXT: v_or_b32_e32 v5, s4, v5
; SI-NEXT: v_readlane_b32 s4, v62, 45
; SI-NEXT: s_add_i32 s98, s4, 3
; SI-NEXT: v_readlane_b32 s5, v62, 43
-; SI-NEXT: v_readlane_b32 s6, v62, 14
; SI-NEXT: s_and_b32 s4, s98, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_add_i32 s8, s6, 3
+; SI-NEXT: s_add_i32 s8, s91, 3
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s8, s8, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
@@ -209805,27 +209920,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s53, s4, 0x3000000
; SI-NEXT: v_readlane_b32 s4, v62, 41
+; SI-NEXT: v_mov_b32_e32 v19, v26
; SI-NEXT: s_add_i32 s86, s4, 3
; SI-NEXT: v_readlane_b32 s5, v62, 38
+; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v5
; SI-NEXT: s_and_b32 s4, s86, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v19
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_or_b32_e32 v10, v59, v10
-; SI-NEXT: v_or_b32_e32 v10, s4, v10
-; SI-NEXT: v_readlane_b32 s4, v62, 31
+; SI-NEXT: v_or_b32_e32 v5, v23, v5
+; SI-NEXT: v_or_b32_e32 v5, s4, v5
+; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_readlane_b32 s4, v62, 29
; SI-NEXT: s_add_i32 s66, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 27
+; SI-NEXT: v_readlane_b32 s5, v62, 26
; SI-NEXT: s_and_b32 s4, s66, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_add_i32 s37, s39, 3
+; SI-NEXT: s_add_i32 s37, s38, 3
; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: v_readlane_b32 s5, v62, 21
; SI-NEXT: s_and_b32 s8, s37, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s95, 24
+; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_lshl_b32 s8, s8, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s8
@@ -209840,33 +209961,65 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: v_readlane_b32 s5, v62, 37
+; SI-NEXT: v_readlane_b32 s6, v62, 34
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: s_add_i32 s71, s22, 3
+; SI-NEXT: s_add_i32 s71, s6, 3
; SI-NEXT: s_and_b32 s8, s71, 0xff
; SI-NEXT: s_lshl_b32 s8, s8, 16
; SI-NEXT: s_add_i32 s35, s99, 3
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_or_b32_e32 v11, v32, v11
-; SI-NEXT: v_or_b32_e32 v11, s4, v11
+; SI-NEXT: s_and_b32 s6, s35, 0xff
+; SI-NEXT: s_lshl_b32 s6, s6, 16
+; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_mov_b32_e32 v31, s9
+; SI-NEXT: v_mov_b32_e32 v39, s70
+; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v2
+; SI-NEXT: v_mov_b32_e32 v28, s39
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4
+; SI-NEXT: v_mov_b32_e32 v27, s48
+; SI-NEXT: v_mov_b32_e32 v26, s49
+; SI-NEXT: v_mov_b32_e32 v25, s94
+; SI-NEXT: v_mov_b32_e32 v24, s53
+; SI-NEXT: v_mov_b32_e32 v23, s36
+; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16
+; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16
+; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16
+; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16
+; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16
+; SI-NEXT: v_alignbit_b32 v53, v28, v36, 16
+; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16
+; SI-NEXT: v_alignbit_b32 v51, v31, v3, 16
+; SI-NEXT: s_lshr_b32 s57, s36, 16
+; SI-NEXT: s_lshr_b32 s56, s53, 16
+; SI-NEXT: s_lshr_b32 s47, s94, 16
+; SI-NEXT: s_lshr_b32 s46, s49, 16
+; SI-NEXT: s_lshr_b32 s45, s48, 16
+; SI-NEXT: s_lshr_b32 s44, s39, 16
+; SI-NEXT: s_lshr_b32 s43, s70, 16
+; SI-NEXT: s_lshr_b32 s41, s9, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v22, v5
+; SI-NEXT: v_or_b32_e32 v5, s4, v5
+; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: v_readlane_b32 s4, v62, 40
; SI-NEXT: s_add_i32 s85, s4, 3
; SI-NEXT: s_and_b32 s4, s85, 0xff
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_readlane_b32 s5, v62, 33
+; SI-NEXT: v_readlane_b32 s5, v62, 31
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s8, s4, 0x3000000
-; SI-NEXT: v_readlane_b32 s4, v62, 54
+; SI-NEXT: v_readlane_b32 s4, v62, 53
; SI-NEXT: s_add_i32 s17, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 53
+; SI-NEXT: v_readlane_b32 s5, v62, 52
; SI-NEXT: s_and_b32 s4, s17, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -209874,58 +210027,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: v_readlane_b32 s5, v62, 47
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_and_b32 s6, s35, 0xff
-; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: v_mov_b32_e32 v30, s16
-; SI-NEXT: v_mov_b32_e32 v39, s9
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2
-; SI-NEXT: v_mov_b32_e32 v28, s11
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4
-; SI-NEXT: v_mov_b32_e32 v27, s48
-; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7
-; SI-NEXT: v_mov_b32_e32 v26, s49
-; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8
-; SI-NEXT: v_mov_b32_e32 v25, s94
-; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9
-; SI-NEXT: v_mov_b32_e32 v24, s53
-; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10
-; SI-NEXT: v_mov_b32_e32 v23, s36
-; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11
; SI-NEXT: v_mov_b32_e32 v22, s8
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v43, v22, v11, 16
-; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16
-; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16
-; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16
-; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16
-; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16
-; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16
-; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16
-; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16
; SI-NEXT: s_lshr_b32 s58, s8, 16
-; SI-NEXT: s_lshr_b32 s57, s36, 16
-; SI-NEXT: s_lshr_b32 s56, s53, 16
-; SI-NEXT: s_lshr_b32 s47, s94, 16
-; SI-NEXT: s_lshr_b32 s46, s49, 16
-; SI-NEXT: s_lshr_b32 s45, s48, 16
-; SI-NEXT: s_lshr_b32 s44, s11, 16
-; SI-NEXT: s_lshr_b32 s43, s9, 16
-; SI-NEXT: s_lshr_b32 s41, s16, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v12, v5
+; SI-NEXT: v_or_b32_e32 v5, v20, v5
; SI-NEXT: v_or_b32_e32 v5, s4, v5
; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13
-; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: v_readlane_b32 s4, v62, 48
; SI-NEXT: s_add_i32 s7, s4, 3
; SI-NEXT: s_and_b32 s4, s7, 0xff
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s89, 24
+; SI-NEXT: s_lshl_b32 s5, s22, 24
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -209940,32 +210060,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_or_b32_e32 v5, v19, v5
+; SI-NEXT: v_readlane_b32 s5, v62, 35
+; SI-NEXT: v_readlane_b32 s6, v62, 27
+; SI-NEXT: s_lshl_b32 s5, s5, 8
+; SI-NEXT: s_add_i32 s55, s6, 3
+; SI-NEXT: s_and_b32 s6, s55, 0xff
+; SI-NEXT: s_lshl_b32 s6, s6, 16
+; SI-NEXT: v_mov_b32_e32 v20, s10
+; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16
+; SI-NEXT: s_lshr_b32 s59, s10, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v5, v14, v5
; SI-NEXT: v_or_b32_e32 v5, s4, v5
; SI-NEXT: v_readlane_b32 s4, v62, 36
; SI-NEXT: s_add_i32 s81, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 35
-; SI-NEXT: v_readlane_b32 s6, v62, 28
; SI-NEXT: s_and_b32 s4, s81, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_add_i32 s55, s6, 3
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_readlane_b32 s5, v62, 26
-; SI-NEXT: s_and_b32 s6, s55, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s5, 24
-; SI-NEXT: s_lshl_b32 s6, s6, 16
+; SI-NEXT: s_lshl_b32 s5, s20, 24
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s12, s4, 0x3000000
-; SI-NEXT: v_readlane_b32 s4, v62, 34
+; SI-NEXT: v_readlane_b32 s4, v62, 32
; SI-NEXT: s_add_i32 s69, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 29
+; SI-NEXT: v_readlane_b32 s5, v62, 28
; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5
; SI-NEXT: s_and_b32 s4, s69, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: s_addk_i32 s4, 0x300
@@ -209973,10 +210096,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: v_or_b32_e32 v5, v18, v5
; SI-NEXT: v_or_b32_e32 v5, s4, v5
-; SI-NEXT: v_readlane_b32 s4, v62, 22
+; SI-NEXT: v_readlane_b32 s4, v62, 20
; SI-NEXT: s_add_i32 s34, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 21
-; SI-NEXT: v_readlane_b32 s6, v62, 19
+; SI-NEXT: v_readlane_b32 s5, v62, 19
+; SI-NEXT: v_readlane_b32 s6, v62, 17
; SI-NEXT: s_and_b32 s4, s34, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_add_i32 s92, s6, 3
@@ -209991,21 +210114,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_add_i32 s13, s4, 0x3000000
; SI-NEXT: v_readlane_b32 s4, v62, 25
; SI-NEXT: s_add_i32 s51, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 24
+; SI-NEXT: v_readlane_b32 s5, v62, 23
; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5
; SI-NEXT: s_and_b32 s4, s51, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_or_b32_e32 v5, v14, v5
+; SI-NEXT: v_or_b32_e32 v5, v61, v5
; SI-NEXT: v_or_b32_e32 v5, s4, v5
-; SI-NEXT: v_readlane_b32 s4, v62, 20
+; SI-NEXT: v_readlane_b32 s4, v62, 18
; SI-NEXT: s_add_i32 s95, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 18
+; SI-NEXT: v_readlane_b32 s5, v62, 16
; SI-NEXT: s_and_b32 s4, s95, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_add_i32 s6, s96, 3
@@ -210022,37 +210145,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s84, 8
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v6, v6, v13
+; SI-NEXT: v_or_b32_e32 v6, v60, v13
; SI-NEXT: v_or_b32_e32 v6, s4, v6
; SI-NEXT: s_add_i32 s4, s83, 3
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_lshl_b32 s5, s25, 8
+; SI-NEXT: s_lshl_b32 s5, s68, 8
; SI-NEXT: s_add_i32 s6, s64, 3
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_readlane_b32 s5, v62, 15
; SI-NEXT: s_and_b32 s6, s6, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s5, 24
+; SI-NEXT: s_lshl_b32 s5, s65, 24
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s15, s4, 0x3000000
-; SI-NEXT: v_readlane_b32 s4, v62, 2
+; SI-NEXT: v_readlane_b32 s4, v62, 4
; SI-NEXT: s_add_i32 s4, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 1
+; SI-NEXT: v_readlane_b32 s5, v62, 3
+; SI-NEXT: v_readlane_b32 s6, v62, 2
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_add_i32 s6, s26, 3
+; SI-NEXT: s_add_i32 s6, s6, 3
; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: v_readlane_b32 s5, v62, 1
; SI-NEXT: s_and_b32 s6, s6, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s27, 24
+; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s6
@@ -210062,21 +210183,20 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_add_i32 s4, s4, 3
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s29, 8
-; SI-NEXT: s_add_i32 s6, s76, 3
+; SI-NEXT: s_add_i32 s6, s11, 3
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_readlane_b32 s5, v62, 16
; SI-NEXT: s_and_b32 s6, s6, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s5, 24
+; SI-NEXT: s_lshl_b32 s5, s90, 24
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s40, s4, 0x3000000
-; SI-NEXT: v_readlane_b32 s4, v62, 7
+; SI-NEXT: v_readlane_b32 s4, v62, 10
; SI-NEXT: s_add_i32 s4, s4, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 17
-; SI-NEXT: v_readlane_b32 s6, v62, 6
+; SI-NEXT: v_readlane_b32 s5, v62, 15
+; SI-NEXT: v_readlane_b32 s6, v62, 9
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_add_i32 s6, s6, 3
@@ -210088,15 +210208,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: s_add_i32 s18, s4, 0x3000000
-; SI-NEXT: s_add_i32 s4, s20, 3
-; SI-NEXT: v_readlane_b32 s5, v62, 5
-; SI-NEXT: v_readlane_b32 s6, v62, 4
+; SI-NEXT: s_add_i32 s21, s4, 0x3000000
+; SI-NEXT: v_readlane_b32 s4, v62, 8
+; SI-NEXT: s_add_i32 s4, s4, 3
+; SI-NEXT: v_readlane_b32 s5, v62, 7
+; SI-NEXT: v_readlane_b32 s6, v62, 6
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_add_i32 s6, s6, 3
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_readlane_b32 s5, v62, 3
+; SI-NEXT: v_readlane_b32 s5, v62, 5
; SI-NEXT: s_and_b32 s6, s6, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
; SI-NEXT: s_lshl_b32 s5, s5, 24
@@ -210105,8 +210226,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s42, s4, 0x3000000
-; SI-NEXT: v_mov_b32_e32 v13, s18
-; SI-NEXT: v_mov_b32_e32 v20, s10
+; SI-NEXT: v_mov_b32_e32 v13, s21
; SI-NEXT: v_mov_b32_e32 v19, s12
; SI-NEXT: v_mov_b32_e32 v18, s13
; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5
@@ -210118,20 +210238,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_alignbit_b32 v58, s40, v13, 16
; SI-NEXT: v_alignbit_b32 v56, v6, v50, 16
; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16
; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16
; SI-NEXT: s_lshr_b32 s73, s42, 16
; SI-NEXT: s_lshr_b32 s72, s40, 16
; SI-NEXT: s_lshr_b32 s63, s15, 16
; SI-NEXT: s_lshr_b32 s62, s14, 16
; SI-NEXT: s_lshr_b32 s61, s13, 16
; SI-NEXT: s_lshr_b32 s60, s12, 16
-; SI-NEXT: s_lshr_b32 s59, s10, 16
; SI-NEXT: .LBB97_3: ; %end
-; SI-NEXT: s_and_b32 s4, s18, 0xffff
+; SI-NEXT: s_and_b32 s4, s21, 0xffff
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57
; SI-NEXT: v_or_b32_e32 v5, s4, v5
; SI-NEXT: s_and_b32 s4, s42, 0xffff
@@ -210140,6 +210256,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_mov_b32_e32 v6, s4
; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen
; SI-NEXT: s_and_b32 s4, s17, 0xffff
@@ -210297,9 +210414,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0
; SI-NEXT: v_mov_b32_e32 v5, s4
; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v36
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53
-; SI-NEXT: s_and_b32 s4, s11, 0xffff
+; SI-NEXT: s_and_b32 s4, s39, 0xffff
; SI-NEXT: s_lshl_b32 s5, s44, 16
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0
@@ -210311,7 +210428,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52
-; SI-NEXT: s_and_b32 s4, s9, 0xffff
+; SI-NEXT: s_and_b32 s4, s70, 0xffff
; SI-NEXT: s_lshl_b32 s5, s43, 16
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
@@ -210324,7 +210441,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51
-; SI-NEXT: s_and_b32 s4, s16, 0xffff
+; SI-NEXT: s_and_b32 s4, s9, 0xffff
; SI-NEXT: s_lshl_b32 s5, s41, 16
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
@@ -210385,27 +210502,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readlane_b32 s31, v63, 1
; SI-NEXT: v_readlane_b32 s30, v63, 0
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB97_4:
-; SI-NEXT: v_mov_b32_e32 v5, v13
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v25, v58
-; SI-NEXT: v_mov_b32_e32 v48, v39
-; SI-NEXT: v_mov_b32_e32 v39, v57
-; SI-NEXT: v_mov_b32_e32 v49, v56
-; SI-NEXT: v_mov_b32_e32 v20, v47
-; SI-NEXT: v_mov_b32_e32 v30, v37
-; SI-NEXT: v_mov_b32_e32 v36, v35
-; SI-NEXT: v_mov_b32_e32 v35, v45
-; SI-NEXT: v_mov_b32_e32 v27, v26
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v32, v23
-; SI-NEXT: ; implicit-def: $sgpr18
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v2, v58
+; SI-NEXT: v_mov_b32_e32 v49, v57
+; SI-NEXT: v_mov_b32_e32 v30, v56
+; SI-NEXT: v_mov_b32_e32 v48, v46
+; SI-NEXT: v_mov_b32_e32 v24, v45
+; SI-NEXT: v_mov_b32_e32 v28, v36
+; SI-NEXT: v_mov_b32_e32 v27, v35
+; SI-NEXT: v_mov_b32_e32 v26, v34
+; SI-NEXT: v_mov_b32_e32 v13, v21
+; SI-NEXT: ; implicit-def: $sgpr21
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $sgpr42
; SI-NEXT: ; implicit-def: $sgpr73
@@ -210457,17 +210573,17 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $sgpr48
; SI-NEXT: ; implicit-def: $sgpr45
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $sgpr11
+; SI-NEXT: ; implicit-def: $sgpr39
; SI-NEXT: ; implicit-def: $sgpr44
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $sgpr9
+; SI-NEXT: ; implicit-def: $sgpr70
; SI-NEXT: ; implicit-def: $sgpr43
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr9
; SI-NEXT: ; implicit-def: $sgpr41
; SI-NEXT: s_branch .LBB97_2
;
@@ -210531,13 +210647,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168
; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v27
; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
@@ -210549,46 +210666,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v24
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200
@@ -210597,34 +210710,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224
; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240
-; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22
-; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24
+; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v26
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v28
; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0
+; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v0
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -210643,6 +210759,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328
@@ -210651,12 +210772,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36
-; VI-NEXT: s_waitcnt vmcnt(11)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5
-; VI-NEXT: s_waitcnt vmcnt(10)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0
; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44
@@ -210665,47 +210782,45 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116
; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148
-; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:156
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:164
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:172
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:196
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:212
; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:228
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:236
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:252
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284
-; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300
-; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316
-; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:284
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292
+; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:300
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:324
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
@@ -210715,46 +210830,50 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: s_cbranch_scc0 .LBB97_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -210771,11 +210890,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -210799,6 +210917,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v17, v10
; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
@@ -210815,38 +210934,43 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v40, v42
+; VI-NEXT: v_mov_b32_e32 v42, v44
+; VI-NEXT: v_mov_b32_e32 v44, v45
+; VI-NEXT: v_mov_b32_e32 v45, v62
+; VI-NEXT: v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v34, v24
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -210854,77 +210978,74 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v45, v62
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v32, v1
; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v54, v22
-; VI-NEXT: v_mov_b32_e32 v41, v24
; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v34, v0
+; VI-NEXT: v_mov_b32_e32 v33, v0
; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v37, v1
-; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v55, v26
+; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v50, v26
; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v39, v0
-; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v49, v1
-; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v43, v27
+; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mov_b32_e32 v51, v0
-; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v35, v1
-; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v53, v28
+; VI-NEXT: v_mov_b32_e32 v53, v1
+; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v52, v28
; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v33, v0
-; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v36, v0
+; VI-NEXT: v_mov_b32_e32 v55, v0
+; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_mov_b32_e32 v35, v0
; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v41, v1
+; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v63, v27
+; VI-NEXT: v_mov_b32_e32 v46, v57
; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v56, v0
+; VI-NEXT: v_mov_b32_e32 v36, v0
; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v58, v1
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v61, v60
-; VI-NEXT: v_mov_b32_e32 v60, v59
+; VI-NEXT: v_mov_b32_e32 v56, v1
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v61, v59
; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -210936,55 +211057,53 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v50, v0
+; VI-NEXT: v_mov_b32_e32 v58, v0
; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v52, v0
-; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v43, v0
+; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v46, v1
-; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v60, v1
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v63, v0
-; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v54, v0
+; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v47, v1
-; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_mov_b32_e32 v57, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
@@ -211016,12 +211135,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_cbranch_execnz .LBB97_3
; VI-NEXT: .LBB97_2: ; %cmp.true
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59
-; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: s_add_i32 s28, s28, 3
; VI-NEXT: s_and_b32 s4, s28, 0xff
; VI-NEXT: s_lshl_b32 s5, s29, 8
@@ -211040,165 +211157,147 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_lshl_b32 s9, s19, 8
; VI-NEXT: s_add_i32 s16, s16, 3
; VI-NEXT: s_lshl_b32 s10, s17, 8
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59
+; VI-NEXT: v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62
-; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44
-; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45
-; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44
+; VI-NEXT: v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42
-; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40
-; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60
-; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61
-; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v46
+; VI-NEXT: v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48
; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38
; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50
+; VI-NEXT: v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v36
; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49
; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57
-; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v58
+; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34
; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14
@@ -211207,67 +211306,78 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36
-; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26
-; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52
-; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54
-; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v35
+; VI-NEXT: v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v25
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x300, v59
+; VI-NEXT: v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13
-; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21
-; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25
+; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51
; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59
-; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1
+; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v63
+; VI-NEXT: v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12
-; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25
+; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26
+; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -211291,15 +211401,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10
; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55
; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53
-; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v52
+; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v53
+; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9
; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10
+; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27
-; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28
-; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -211315,18 +211424,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42
; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40
-; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1
-; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8
; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11
-; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
@@ -211366,19 +211471,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46
; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2
+; VI-NEXT: v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4
; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4
@@ -211445,35 +211560,38 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB97_4:
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v61, v60
-; VI-NEXT: v_mov_b32_e32 v60, v59
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v61, v59
+; VI-NEXT: v_mov_b32_e32 v46, v57
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v40, v42
+; VI-NEXT: v_mov_b32_e32 v42, v44
+; VI-NEXT: v_mov_b32_e32 v44, v45
; VI-NEXT: v_mov_b32_e32 v45, v62
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v57, v5
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; VI-NEXT: v_mov_b32_e32 v47, v4
-; VI-NEXT: v_mov_b32_e32 v63, v3
-; VI-NEXT: v_mov_b32_e32 v53, v28
-; VI-NEXT: v_mov_b32_e32 v43, v27
-; VI-NEXT: v_mov_b32_e32 v55, v26
-; VI-NEXT: v_mov_b32_e32 v41, v24
-; VI-NEXT: v_mov_b32_e32 v54, v22
+; VI-NEXT: v_mov_b32_e32 v54, v3
+; VI-NEXT: v_mov_b32_e32 v52, v28
+; VI-NEXT: v_mov_b32_e32 v63, v27
+; VI-NEXT: v_mov_b32_e32 v50, v26
+; VI-NEXT: v_mov_b32_e32 v34, v24
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; VI-NEXT: s_branch .LBB97_2
@@ -211535,18 +211653,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27
; GFX9-NEXT: s_waitcnt vmcnt(24)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43
; GFX9-NEXT: s_waitcnt vmcnt(23)
@@ -211575,10 +211693,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
@@ -211590,7 +211708,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(23)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
@@ -211638,7 +211756,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
@@ -211665,23 +211783,23 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9
-; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320
@@ -211694,48 +211812,49 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(15)
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; GFX9-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:92
; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100
; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:116
; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:140
; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:156
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164
-; GFX9-NEXT: s_waitcnt vmcnt(21)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172
-; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180
-; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:172
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:180
+; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204
; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212
-; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220
-; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:220
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:228
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236
; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252
-; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:260
; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:276
; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284
; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:300
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:308
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:316
; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
@@ -211746,55 +211865,54 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(33)
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(36)
-; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: s_waitcnt vmcnt(38)
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(40)
; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
@@ -211804,7 +211922,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: s_cbranch_scc0 .LBB97_2
@@ -211817,7 +211935,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -211854,10 +211972,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
@@ -211873,13 +211991,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -211887,7 +212005,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -211928,8 +212046,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v52, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v50, v0
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -211947,16 +212065,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v48, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_mov_b32_e32 v33, v45
+; GFX9-NEXT: v_mov_b32_e32 v33, v46
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
@@ -211969,7 +212087,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -211978,7 +212096,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -211986,121 +212104,122 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v34, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_mov_b32_e32 v46, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: v_mov_b32_e32 v35, v45
-; GFX9-NEXT: v_mov_b32_e32 v45, v61
-; GFX9-NEXT: v_mov_b32_e32 v61, v42
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v38, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_mov_b32_e32 v54, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v41, v57
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v57, v59
; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v56, v42
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch .LBB97_3
; GFX9-NEXT: .LBB97_2:
; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT: v_mov_b32_e32 v33, v45
-; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v33, v46
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: v_mov_b32_e32 v56, v61
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX9-NEXT: .LBB97_3: ; %Flow
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -212303,7 +212422,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -212363,11 +212482,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
@@ -212402,7 +212521,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; GFX9-NEXT: v_add_u32_e32 v24, 3, v24
-; GFX9-NEXT: v_add_u32_e32 v26, 3, v61
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v62
; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24
; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48
@@ -212411,7 +212530,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; GFX9-NEXT: v_add_u32_e32 v26, 3, v45
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v61
; GFX9-NEXT: v_add_u32_e32 v20, 3, v20
; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20
@@ -212420,7 +212539,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: v_add_u32_e32 v26, 3, v56
; GFX9-NEXT: v_add_u32_e32 v21, 3, v21
-; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21
; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21
@@ -216299,1192 +216418,1031 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21
-; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v16
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v29
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v28
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v27
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v26
+; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v24
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v18
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr32
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; kill: killed $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; kill: killed $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; kill: killed $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; kill: killed $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
; VI-NEXT: ; implicit-def: $vgpr44
; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB98_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v31, v7
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v9, v10
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v9, v11
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v9, v12
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v9, v13
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v9, v14
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v9, v16
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v9, v8
-; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v7, v5
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v7, v6
-; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
-; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v36
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v37
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v30
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v22
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v19
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v20
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v17
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v18
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v46
-; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18]
-; VI-NEXT: v_mov_b32_e32 v32, v15
-; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24
-; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20
-; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18
-; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17
-; VI-NEXT: v_mov_b32_e32 v46, v1
-; VI-NEXT: ; implicit-def: $vgpr1
-; VI-NEXT: ; implicit-def: $vgpr3
-; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: ; implicit-def: $vgpr7
-; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: ; implicit-def: $vgpr11
-; VI-NEXT: ; implicit-def: $vgpr13
-; VI-NEXT: ; implicit-def: $vgpr15
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr21
-; VI-NEXT: ; implicit-def: $vgpr23
-; VI-NEXT: ; implicit-def: $vgpr25
-; VI-NEXT: ; implicit-def: $vgpr27
-; VI-NEXT: ; implicit-def: $vgpr29
-; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v9
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[3:4]
+; VI-NEXT: v_mov_b32_e32 v34, v47
+; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[1:2]
+; VI-NEXT: v_mov_b32_e32 v47, v34
+; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[31:32]
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v34, v36
+; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
+; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; VI-NEXT: v_mov_b32_e32 v36, v34
+; VI-NEXT: v_mov_b32_e32 v34, v50
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; VI-NEXT: v_mov_b32_e32 v39, v62
+; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[21:22]
+; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8]
+; VI-NEXT: v_mov_b32_e32 v50, v34
+; VI-NEXT: v_mov_b32_e32 v62, v39
+; VI-NEXT: v_mov_b32_e32 v34, v40
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20]
+; VI-NEXT: v_mov_b32_e32 v40, v34
+; VI-NEXT: v_mov_b32_e32 v34, v43
+; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[17:18]
+; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v20
+; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v18
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v17
+; VI-NEXT: v_mov_b32_e32 v43, v34
; VI-NEXT: .LBB98_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB98_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v31, 3
-; VI-NEXT: v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v32, 3, v18
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v18, v32, v18
-; VI-NEXT: v_add_u16_e32 v32, 3, v17
-; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, 3
+; VI-NEXT: v_add_u16_e32 v34, 3, v18
+; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; VI-NEXT: v_or_b32_e32 v35, v34, v18
+; VI-NEXT: v_add_u16_e32 v18, 3, v17
+; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v17, v32, v17
-; VI-NEXT: v_add_u16_e32 v32, 3, v20
-; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v20, v32, v20
-; VI-NEXT: v_add_u16_e32 v32, 3, v19
-; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v19, v32, v19
-; VI-NEXT: v_add_u16_e32 v32, 3, v22
-; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48
-; VI-NEXT: v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v22, v32, v22
-; VI-NEXT: v_add_u16_e32 v32, 3, v21
-; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53
-; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v21, v32, v21
-; VI-NEXT: v_add_u16_e32 v32, 3, v24
-; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v24, v32, v24
-; VI-NEXT: v_add_u16_e32 v32, 3, v23
-; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v23, v32, v23
-; VI-NEXT: v_add_u16_e32 v32, 3, v26
-; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v26, v32, v26
-; VI-NEXT: v_add_u16_e32 v32, 3, v25
-; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v25, v32, v25
-; VI-NEXT: v_add_u16_e32 v32, 3, v28
-; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v39
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v28, v32, v28
-; VI-NEXT: v_add_u16_e32 v32, 3, v27
-; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v27, v32, v27
-; VI-NEXT: v_add_u16_e32 v33, 3, v30
-; VI-NEXT: v_add_u16_e32 v34, 3, v29
-; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35
-; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v30, v33, v29
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32
-; VI-NEXT: v_add_u16_e32 v33, 3, v37
-; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v29, v34, v29
-; VI-NEXT: v_add_u16_e32 v34, 3, v36
-; VI-NEXT: v_or_b32_e32 v37, v33, v32
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50
-; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v36, v34, v32
-; VI-NEXT: v_add_u16_e32 v33, 3, v2
-; VI-NEXT: v_add_u16_e32 v34, 3, v1
-; VI-NEXT: v_add_u16_sdwa v32, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v57
-; VI-NEXT: v_or_b32_e32 v2, v33, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32
-; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v1, v34, v1
-; VI-NEXT: v_add_u16_e32 v33, 3, v4
-; VI-NEXT: v_add_u16_e32 v34, 3, v3
-; VI-NEXT: v_add_u16_sdwa v32, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v56
-; VI-NEXT: v_or_b32_e32 v4, v33, v3
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32
-; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v3, v34, v3
-; VI-NEXT: v_add_u16_e32 v33, 3, v6
-; VI-NEXT: v_add_u16_e32 v34, 3, v5
-; VI-NEXT: v_add_u16_sdwa v32, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47
-; VI-NEXT: v_or_b32_e32 v6, v33, v5
-; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v5, v34, v5
-; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_add_u16_e32 v38, 3, v8
-; VI-NEXT: v_add_u16_e32 v33, 3, v7
-; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34
-; VI-NEXT: v_or_b32_e32 v8, v38, v7
-; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32
-; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v7, v33, v7
-; VI-NEXT: v_add_u16_e32 v33, 3, v10
-; VI-NEXT: v_add_u16_e32 v38, 3, v9
-; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59
-; VI-NEXT: v_or_b32_e32 v10, v33, v9
-; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32
-; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v9, v38, v9
-; VI-NEXT: v_add_u16_e32 v33, 3, v12
-; VI-NEXT: v_add_u16_e32 v38, 3, v11
-; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v12, v33, v11
-; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
-; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v11, v38, v11
-; VI-NEXT: v_add_u16_e32 v38, 3, v14
-; VI-NEXT: v_add_u16_e32 v49, 3, v13
-; VI-NEXT: v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33
-; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v14, v38, v13
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32
-; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v34, v18, v17
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_sdwa v17, v20, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v34, 3, v20
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v35, v34, v17
+; VI-NEXT: v_add_u16_sdwa v17, v19, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v20, 3, v19
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v34, v20, v17
+; VI-NEXT: v_add_u16_sdwa v17, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v19, 3, v22
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v61, v19, v17
+; VI-NEXT: v_add_u16_sdwa v17, v21, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v22, 3, v21
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v60, v22, v17
+; VI-NEXT: v_add_u16_sdwa v17, v24, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v19, 3, v24
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v63, v19, v17
+; VI-NEXT: v_add_u16_sdwa v17, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v24, 3, v23
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v62, v24, v17
+; VI-NEXT: v_add_u16_sdwa v17, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v19, 3, v26
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v58, v19, v17
+; VI-NEXT: v_add_u16_sdwa v17, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v26, 3, v25
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v57, v26, v17
+; VI-NEXT: v_add_u16_sdwa v17, v28, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v19, 3, v28
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v56, v19, v17
+; VI-NEXT: v_add_u16_sdwa v17, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v28, 3, v27
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v55, v28, v17
+; VI-NEXT: v_add_u16_sdwa v17, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v21, 3, v30
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_sdwa v19, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_add_u16_e32 v30, 3, v29
+; VI-NEXT: v_or_b32_e32 v40, v21, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v19
+; VI-NEXT: v_or_b32_e32 v39, v30, v17
+; VI-NEXT: v_add_u16_sdwa v17, v32, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v45, 3, v32
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_sdwa v19, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v32, 3, v31
+; VI-NEXT: v_or_b32_e32 v38, v45, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v19
+; VI-NEXT: v_add_u16_sdwa v21, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v37, v32, v17
+; VI-NEXT: v_add_u16_e32 v17, 3, v2
+; VI-NEXT: v_add_u16_e32 v2, 3, v1
+; VI-NEXT: v_add_u16_sdwa v19, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21
+; VI-NEXT: v_or_b32_e32 v49, v17, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19
+; VI-NEXT: v_or_b32_e32 v48, v2, v1
+; VI-NEXT: v_add_u16_sdwa v1, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v19, 3, v4
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v4, 3, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_or_b32_e32 v47, v19, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_or_b32_e32 v46, v4, v1
+; VI-NEXT: v_add_u16_sdwa v1, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v21, 3, v6
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_sdwa v3, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u16_e32 v6, 3, v5
+; VI-NEXT: v_or_b32_e32 v44, v21, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_or_b32_e32 v43, v6, v1
+; VI-NEXT: v_add_u16_sdwa v1, v8, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v23, 3, v8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_sdwa v3, v7, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u16_e32 v8, 3, v7
+; VI-NEXT: v_or_b32_e32 v42, v23, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_or_b32_e32 v41, v8, v1
+; VI-NEXT: v_add_u16_sdwa v1, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v25, 3, v10
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_sdwa v3, v9, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u16_e32 v10, 3, v9
+; VI-NEXT: v_or_b32_e32 v54, v25, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_add_u16_sdwa v52, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v53, v10, v1
+; VI-NEXT: v_add_u16_e32 v27, 3, v12
+; VI-NEXT: v_add_u16_sdwa v3, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52
+; VI-NEXT: v_add_u16_e32 v12, 3, v11
+; VI-NEXT: v_or_b32_e32 v51, v27, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_or_b32_e32 v50, v12, v1
+; VI-NEXT: v_add_u16_sdwa v1, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v29, 3, v14
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_sdwa v3, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u16_e32 v14, 3, v13
+; VI-NEXT: v_or_b32_e32 v36, v29, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_add_u16_sdwa v59, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v35, v14, v1
+; VI-NEXT: v_add_u16_sdwa v3, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_e32 v16, 3, v16
-; VI-NEXT: v_add_u16_e32 v32, 3, v15
-; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v16, v16, v15
-; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v15, v32, v15
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
-; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v13, v49, v13
-; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13
-; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9
-; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
-; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
-; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
-; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v60, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v33, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v63, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v59, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v34, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v47, 8, 8
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v56, 8, 8
-; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v57, 8, 8
-; VI-NEXT: v_mov_b32_e32 v46, v35
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v52, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v46, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v39, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v49, v53
-; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v52, v51
-; VI-NEXT: v_bfe_u32 v31, v51, 8, 8
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
-; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17
-; VI-NEXT: v_bfe_u32 v35, v58, 8, 8
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v39, v61, 8, 8
-; VI-NEXT: v_bfe_u32 v58, v48, 8, 8
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v59
+; VI-NEXT: v_add_u16_e32 v15, 3, v15
+; VI-NEXT: v_or_b32_e32 v34, v16, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_or_b32_e32 v33, v15, v1
+; VI-NEXT: v_mov_b32_e32 v31, v32
+; VI-NEXT: v_mov_b32_e32 v32, v45
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v34
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, v17
+; VI-NEXT: v_mov_b32_e32 v3, v4
+; VI-NEXT: v_mov_b32_e32 v4, v19
+; VI-NEXT: v_mov_b32_e32 v5, v6
+; VI-NEXT: v_mov_b32_e32 v6, v21
+; VI-NEXT: v_mov_b32_e32 v7, v8
+; VI-NEXT: v_mov_b32_e32 v8, v23
+; VI-NEXT: v_mov_b32_e32 v9, v10
+; VI-NEXT: v_mov_b32_e32 v10, v25
+; VI-NEXT: v_mov_b32_e32 v11, v12
+; VI-NEXT: v_mov_b32_e32 v12, v27
+; VI-NEXT: v_mov_b32_e32 v13, v14
+; VI-NEXT: v_mov_b32_e32 v14, v29
+; VI-NEXT: v_mov_b32_e32 v17, v18
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v19, v20
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v21, v22
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v23, v24
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v25, v26
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v27, v28
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v29, v30
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v33
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[33:34]
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v36
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v35
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[35:36]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v51
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v50
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[50:51]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v54
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v53
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[53:54]
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v42
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v41
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v44
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v43
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v47
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v46
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v49
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v48
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v38
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[43:44]
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v37
+; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[46:47]
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[37:38]
+; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[48:49]
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v40
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v39
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[39:40]
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v56
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v55
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v58
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v57
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v63
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v62
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v61
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v60
+; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[55:56]
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[62:63]
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[60:61]
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[57:58]
+; VI-NEXT: v_mov_b32_e32 v57, v52
+; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[41:42]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v55
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v54
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[54:55]
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v59, 8, 8
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v54
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_bfe_u32 v61, v53, 8, 8
+; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[54:55]
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v55
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_bfe_u32 v34, v59, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v52, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v62, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v52, v34
+; VI-NEXT: v_bfe_u32 v34, v34, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v54, v34
+; VI-NEXT: v_bfe_u32 v34, v34, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v51, v34
+; VI-NEXT: v_bfe_u32 v34, v34, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v56, v34
+; VI-NEXT: v_bfe_u32 v60, v34, 8, 8
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v34, v34, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v36, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v50, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v53, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v34, v40, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v34, v34, 8, 8
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v48, v34, 8, 8
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v63, v34
+; VI-NEXT: v_bfe_u32 v55, v34, 8, 8
; VI-NEXT: .LBB98_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v46
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v34, v46, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60
+; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58
+; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31
-; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
+; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v48
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
-; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41
-; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35
+; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -219846,8 +219804,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s82, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24
; SI-NEXT: s_waitcnt vmcnt(14)
@@ -219878,9 +219836,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v43
; SI-NEXT: v_writelane_b32 v62, s6, 0
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB99_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xffff
@@ -219892,7 +219848,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v1, s56
; SI-NEXT: s_and_b32 s4, s20, 0xffff
; SI-NEXT: s_lshl_b32 s5, s21, 16
-; SI-NEXT: v_alignbit_b32 v8, s57, v1, 24
+; SI-NEXT: v_alignbit_b32 v5, s57, v1, 24
; SI-NEXT: v_alignbit_b32 v50, s57, v1, 16
; SI-NEXT: v_alignbit_b32 v1, s57, v1, 8
; SI-NEXT: s_or_b32 s46, s4, s5
@@ -219904,39 +219860,43 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v1, s46
; SI-NEXT: s_and_b32 s4, s24, 0xffff
; SI-NEXT: s_lshl_b32 s5, s25, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, s47, v1, 24
+; SI-NEXT: v_alignbit_b32 v5, s47, v1, 24
; SI-NEXT: s_or_b32 s44, s4, s5
; SI-NEXT: s_and_b32 s4, s26, 0xffff
; SI-NEXT: s_lshl_b32 s5, s27, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, s47, v1, 16
+; SI-NEXT: v_alignbit_b32 v5, s47, v1, 16
; SI-NEXT: v_alignbit_b32 v51, s47, v1, 8
; SI-NEXT: s_or_b32 s45, s4, s5
; SI-NEXT: v_mov_b32_e32 v1, s44
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v5, s45, v1, 24
; SI-NEXT: s_and_b32 s4, s28, 0xffff
; SI-NEXT: s_lshl_b32 s5, s29, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, s45, v1, 24
+; SI-NEXT: v_alignbit_b32 v5, s45, v1, 16
+; SI-NEXT: v_alignbit_b32 v1, s45, v1, 8
; SI-NEXT: s_or_b32 s42, s4, s5
; SI-NEXT: s_and_b32 s4, s82, 0xffff
; SI-NEXT: s_lshl_b32 s5, s81, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, s45, v1, 16
-; SI-NEXT: v_alignbit_b32 v49, s45, v1, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_or_b32 s43, s4, s5
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s42
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, s43, v1, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v5, s43, v1, 24
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v5, s43, v1, 16
+; SI-NEXT: v_alignbit_b32 v1, s43, v1, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, s43, v1, 16
-; SI-NEXT: v_alignbit_b32 v48, s43, v1, 8
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3
; SI-NEXT: s_and_b32 s4, s85, 0xffff
; SI-NEXT: s_lshl_b32 s5, s84, 16
@@ -219986,7 +219946,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_or_b32 s10, s4, s5
; SI-NEXT: s_and_b32 s4, s75, 0xffff
; SI-NEXT: s_lshl_b32 s5, s74, 16
-; SI-NEXT: v_or_b32_e32 v12, v1, v5
+; SI-NEXT: v_or_b32_e32 v12, v1, v6
; SI-NEXT: s_or_b32 s9, s4, s5
; SI-NEXT: s_and_b32 s4, s77, 0xffff
; SI-NEXT: s_lshl_b32 s5, s76, 16
@@ -220010,7 +219970,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
; SI-NEXT: v_writelane_b32 v62, s4, 1
; SI-NEXT: s_lshr_b32 s4, s10, 8
-; SI-NEXT: v_or_b32_e32 v10, v1, v6
+; SI-NEXT: v_or_b32_e32 v10, v1, v8
; SI-NEXT: v_writelane_b32 v62, s4, 3
; SI-NEXT: s_lshr_b32 s4, s9, 8
; SI-NEXT: v_alignbit_b32 v1, s14, v10, 24
@@ -220031,32 +219991,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19
; SI-NEXT: v_writelane_b32 v62, s4, 15
; SI-NEXT: s_and_b32 s4, s72, 0xffff
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v28, v8
; SI-NEXT: v_or_b32_e32 v8, v1, v9
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23
; SI-NEXT: v_writelane_b32 v62, s4, 2
; SI-NEXT: s_and_b32 s4, s74, 0xffff
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_or_b32_e32 v5, v1, v13
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
; SI-NEXT: v_writelane_b32 v62, s4, 5
; SI-NEXT: s_and_b32 s4, s76, 0xffff
-; SI-NEXT: v_mov_b32_e32 v28, v13
+; SI-NEXT: v_mov_b32_e32 v30, v13
; SI-NEXT: v_or_b32_e32 v13, v1, v17
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31
; SI-NEXT: v_writelane_b32 v62, s4, 8
; SI-NEXT: s_and_b32 s4, s78, 0xffff
-; SI-NEXT: v_mov_b32_e32 v26, v9
+; SI-NEXT: v_mov_b32_e32 v29, v9
; SI-NEXT: v_or_b32_e32 v9, v1, v18
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
; SI-NEXT: v_writelane_b32 v62, s4, 11
; SI-NEXT: s_and_b32 s4, s88, 0xffff
-; SI-NEXT: v_mov_b32_e32 v25, v6
+; SI-NEXT: v_mov_b32_e32 v26, v6
; SI-NEXT: v_or_b32_e32 v6, v1, v20
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33
; SI-NEXT: v_writelane_b32 v62, s4, 14
; SI-NEXT: s_bfe_u32 s4, s74, 0x80008
+; SI-NEXT: v_mov_b32_e32 v25, v4
; SI-NEXT: v_or_b32_e32 v4, v1, v21
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34
; SI-NEXT: v_writelane_b32 v62, s4, 4
@@ -220068,12 +220030,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_or_b32_e32 v1, v1, v24
; SI-NEXT: v_writelane_b32 v62, s4, 10
; SI-NEXT: s_bfe_u32 s4, s88, 0x80008
-; SI-NEXT: v_mov_b32_e32 v29, v17
-; SI-NEXT: v_mov_b32_e32 v30, v18
-; SI-NEXT: v_mov_b32_e32 v36, v20
-; SI-NEXT: v_mov_b32_e32 v37, v21
-; SI-NEXT: v_mov_b32_e32 v38, v22
-; SI-NEXT: v_mov_b32_e32 v39, v24
+; SI-NEXT: v_mov_b32_e32 v36, v17
+; SI-NEXT: v_mov_b32_e32 v37, v18
+; SI-NEXT: v_mov_b32_e32 v38, v20
+; SI-NEXT: v_mov_b32_e32 v39, v21
+; SI-NEXT: v_mov_b32_e32 v48, v22
+; SI-NEXT: v_mov_b32_e32 v49, v24
; SI-NEXT: s_lshr_b32 s68, s57, 8
; SI-NEXT: s_lshr_b32 s65, s47, 8
; SI-NEXT: s_lshr_b32 s54, s45, 8
@@ -220134,9 +220096,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_alignbit_b32 v54, s6, v1, 8
; SI-NEXT: s_cbranch_execnz .LBB99_3
; SI-NEXT: .LBB99_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_and_b32 s4, s18, 0xffff
; SI-NEXT: s_lshl_b32 s5, s88, 16
@@ -220248,50 +220208,54 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v22, s47, v21, 24
-; SI-NEXT: s_lshr_b32 s4, s11, 8
-; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v22, s47, v21, 16
; SI-NEXT: v_alignbit_b32 v51, s47, v21, 8
; SI-NEXT: v_mov_b32_e32 v21, s44
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24
+; SI-NEXT: s_lshr_b32 s4, s11, 8
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16
+; SI-NEXT: v_alignbit_b32 v21, s45, v21, 8
; SI-NEXT: v_writelane_b32 v62, s4, 1
; SI-NEXT: s_lshr_b32 s4, s10, 16
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24
+; SI-NEXT: v_mov_b32_e32 v21, s42
; SI-NEXT: v_writelane_b32 v62, s4, 2
; SI-NEXT: s_lshr_b32 s4, s10, 8
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_or_b32_e32 v3, v16, v3
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16
-; SI-NEXT: v_alignbit_b32 v49, s45, v21, 8
-; SI-NEXT: v_mov_b32_e32 v21, s42
+; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24
; SI-NEXT: v_writelane_b32 v62, s4, 3
; SI-NEXT: s_lshr_b32 s4, s9, 24
-; SI-NEXT: v_or_b32_e32 v5, v36, v5
+; SI-NEXT: v_or_b32_e32 v5, v38, v5
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v3
; SI-NEXT: v_mov_b32_e32 v3, s41
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24
+; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16
+; SI-NEXT: v_alignbit_b32 v21, s43, v21, 8
; SI-NEXT: v_writelane_b32 v62, s4, 4
; SI-NEXT: s_lshr_b32 s4, s9, 16
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v5
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v31
-; SI-NEXT: v_or_b32_e32 v7, v14, v7
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v7, v25, v7
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16
-; SI-NEXT: v_alignbit_b32 v48, s43, v21, 8
; SI-NEXT: v_alignbit_b32 v21, v3, v16, 24
; SI-NEXT: v_writelane_b32 v62, s4, 5
; SI-NEXT: s_lshr_b32 s4, s9, 8
@@ -220305,7 +220269,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_alignbit_b32 v3, v3, v16, 8
; SI-NEXT: v_writelane_b32 v62, s4, 6
; SI-NEXT: s_lshr_b32 s4, s8, 24
-; SI-NEXT: v_or_b32_e32 v5, v30, v5
+; SI-NEXT: v_or_b32_e32 v5, v37, v5
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -220314,7 +220278,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_lshr_b32 s4, s8, 16
; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v5
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27
-; SI-NEXT: v_or_b32_e32 v11, v12, v11
+; SI-NEXT: v_or_b32_e32 v11, v26, v11
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v3, v7, v14, 16
@@ -220329,7 +220293,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_alignbit_b32 v3, v7, v14, 8
; SI-NEXT: v_writelane_b32 v62, s4, 9
; SI-NEXT: s_lshr_b32 s4, s7, 24
-; SI-NEXT: v_or_b32_e32 v5, v29, v5
+; SI-NEXT: v_or_b32_e32 v5, v36, v5
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -220342,7 +220306,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v5
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19
-; SI-NEXT: v_or_b32_e32 v10, v25, v10
+; SI-NEXT: v_or_b32_e32 v10, v28, v10
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v3, v11, v12, 16
@@ -220360,11 +220324,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_alignbit_b32 v3, v11, v12, 8
; SI-NEXT: v_writelane_b32 v62, s4, 12
; SI-NEXT: s_lshr_b32 s4, s6, 24
-; SI-NEXT: v_or_b32_e32 v1, v39, v1
-; SI-NEXT: v_or_b32_e32 v2, v38, v2
-; SI-NEXT: v_or_b32_e32 v4, v37, v4
-; SI-NEXT: v_or_b32_e32 v5, v28, v5
-; SI-NEXT: v_or_b32_e32 v8, v26, v8
+; SI-NEXT: v_or_b32_e32 v1, v49, v1
+; SI-NEXT: v_or_b32_e32 v2, v48, v2
+; SI-NEXT: v_or_b32_e32 v4, v39, v4
+; SI-NEXT: v_or_b32_e32 v5, v30, v5
+; SI-NEXT: v_or_b32_e32 v8, v29, v8
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v3, v15, v10, 24
@@ -220532,65 +220496,64 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: v_mov_b32_e32 v7, s4
; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_and_b32 s4, s44, 0xff
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v3, s4, v3
; SI-NEXT: s_and_b32 s4, s45, 0xff
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_and_b32 s5, s66, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s5, s16, s5
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_lshl_b32 s5, s51, 8
-; SI-NEXT: s_lshl_b32 s16, s67, 24
-; SI-NEXT: v_readlane_b32 s67, v63, 19
-; SI-NEXT: v_readlane_b32 s66, v63, 18
-; SI-NEXT: v_readlane_b32 s51, v63, 11
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_and_b32 s5, s66, 0xff
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT: s_lshl_b32 s5, s5, 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v7, v11, v7
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: s_or_b32 s5, s16, s5
; SI-NEXT: v_or_b32_e32 v3, v3, v7
; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0
+; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_mov_b32_e32 v7, s4
; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_and_b32 s4, s42, 0xff
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48
+; SI-NEXT: s_lshl_b32 s5, s51, 8
+; SI-NEXT: s_lshl_b32 s16, s67, 24
+; SI-NEXT: v_readlane_b32 s67, v63, 19
+; SI-NEXT: v_readlane_b32 s66, v63, 18
+; SI-NEXT: v_readlane_b32 s51, v63, 11
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v3, s4, v3
; SI-NEXT: s_and_b32 s4, s43, 0xff
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_and_b32 s5, s55, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s5, s16, s5
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_lshl_b32 s5, s48, 8
-; SI-NEXT: s_lshl_b32 s16, s64, 24
-; SI-NEXT: v_readlane_b32 s64, v63, 16
-; SI-NEXT: v_readlane_b32 s55, v63, 15
-; SI-NEXT: v_readlane_b32 s48, v63, 8
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_and_b32 s5, s55, 0xff
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT: s_lshl_b32 s5, s5, 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v7, v11, v7
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: s_or_b32 s5, s16, s5
; SI-NEXT: v_or_b32_e32 v3, v3, v7
; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0
+; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
@@ -220600,16 +220563,21 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v3, 0xff, v16
; SI-NEXT: s_and_b32 s4, s41, 0xff
+; SI-NEXT: s_lshl_b32 s5, s48, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s52, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
+; SI-NEXT: s_lshl_b32 s16, s64, 24
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s16, s5
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_lshl_b32 s5, s37, 8
; SI-NEXT: s_lshl_b32 s16, s53, 24
+; SI-NEXT: v_readlane_b32 s64, v63, 16
+; SI-NEXT: v_readlane_b32 s55, v63, 15
; SI-NEXT: v_readlane_b32 s53, v63, 13
; SI-NEXT: v_readlane_b32 s52, v63, 12
+; SI-NEXT: v_readlane_b32 s48, v63, 8
; SI-NEXT: v_readlane_b32 s37, v63, 5
; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -220969,55 +220937,62 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
; SI-NEXT: ; implicit-def: $vcc_lo
-; SI-NEXT: v_mov_b32_e32 v39, v24
+; SI-NEXT: v_mov_b32_e32 v49, v24
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: v_mov_b32_e32 v38, v22
+; SI-NEXT: v_mov_b32_e32 v48, v22
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: v_mov_b32_e32 v37, v21
+; SI-NEXT: v_mov_b32_e32 v39, v21
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: v_mov_b32_e32 v36, v20
+; SI-NEXT: v_mov_b32_e32 v38, v20
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: v_mov_b32_e32 v30, v18
+; SI-NEXT: v_mov_b32_e32 v37, v18
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: v_mov_b32_e32 v29, v17
+; SI-NEXT: v_mov_b32_e32 v36, v17
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: v_mov_b32_e32 v28, v13
+; SI-NEXT: v_mov_b32_e32 v30, v13
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: v_mov_b32_e32 v26, v9
+; SI-NEXT: v_mov_b32_e32 v29, v9
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: v_mov_b32_e32 v25, v6
+; SI-NEXT: v_mov_b32_e32 v28, v8
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: v_mov_b32_e32 v26, v6
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: v_mov_b32_e32 v25, v4
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $vcc_lo
; SI-NEXT: ; implicit-def: $vcc_lo
; SI-NEXT: ; implicit-def: $sgpr56
@@ -221033,13 +221008,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr69
; SI-NEXT: ; implicit-def: $sgpr80
; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $sgpr45
; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; implicit-def: $sgpr66
; SI-NEXT: ; implicit-def: $sgpr70
; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $sgpr43
; SI-NEXT: ; implicit-def: $sgpr51
; SI-NEXT: ; implicit-def: $sgpr55
@@ -221088,12 +221061,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr9
; SI-NEXT: ; implicit-def: $sgpr8
; SI-NEXT: ; implicit-def: $sgpr7
-; SI-NEXT: ; kill: killed $vcc_lo
-; SI-NEXT: ; implicit-def: $vcc_lo
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: ; kill: killed $vcc_lo
+; SI-NEXT: ; implicit-def: $vcc_lo
; SI-NEXT: ; kill: killed $vcc_lo
; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr24
@@ -221120,7 +221093,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -222717,7 +222689,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22
@@ -222725,7 +222697,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22
@@ -223024,7 +222996,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_readlane_b32 s4, v62, 22
; GFX9-NEXT: v_mov_b32_e32 v60, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 23
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v17, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 24
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
@@ -223032,7 +223004,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: v_readlane_b32 s4, v62, 25
; GFX9-NEXT: v_mov_b32_e32 v23, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 26
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v17, s4
; GFX9-NEXT: v_readlane_b32 s4, v62, 27
; GFX9-NEXT: v_mov_b32_e32 v59, s4
@@ -223302,14 +223274,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -229368,8 +229340,8 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -229393,403 +229365,431 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21
; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22
; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23
; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24
; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v28
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22
; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24
; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v30, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v27, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s28
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30
; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38
+; SI-NEXT: s_waitcnt vmcnt(12) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48
-; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v42
+; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v49
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v42
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s29
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB101_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mov_b32_e32 v42, v51
-; SI-NEXT: v_mov_b32_e32 v55, v50
-; SI-NEXT: v_mov_b32_e32 v40, v52
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18
; SI-NEXT: v_cvt_f32_f16_e32 v15, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18
; SI-NEXT: v_cvt_f32_f16_e32 v16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_mov_b32_e32 v20, v44
+; SI-NEXT: v_mov_b32_e32 v22, v21
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
; SI-NEXT: v_cvt_f32_f16_e32 v18, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mov_b32_e32 v24, v47
; SI-NEXT: v_mov_b32_e32 v23, v46
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_mov_b32_e32 v25, v56
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mov_b32_e32 v26, v57
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v27, v57
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_mov_b32_e32 v28, v26
+; SI-NEXT: v_mov_b32_e32 v29, v58
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_mov_b32_e32 v30, v14
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v45, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v36, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59
; SI-NEXT: v_cvt_f32_f16_e32 v59, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35
-; SI-NEXT: v_mov_b32_e32 v35, v43
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mov_b32_e32 v38, v10
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63
; SI-NEXT: v_cvt_f32_f16_e32 v61, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mov_b32_e32 v19, v28
-; SI-NEXT: v_mov_b32_e32 v28, v14
-; SI-NEXT: v_mov_b32_e32 v39, v22
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v19, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_cvt_f32_f16_e32 v62, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v1
+; SI-NEXT: v_mov_b32_e32 v44, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v62, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46
+; SI-NEXT: v_mov_b32_e32 v46, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v63, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT: v_mov_b32_e32 v47, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49
-; SI-NEXT: v_mov_b32_e32 v49, v15
-; SI-NEXT: v_mov_b32_e32 v15, v41
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v53
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52
-; SI-NEXT: v_mov_b32_e32 v51, v53
-; SI-NEXT: v_mov_b32_e32 v53, v54
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT: v_mov_b32_e32 v20, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33
+; SI-NEXT: v_mov_b32_e32 v33, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v47, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v48, v10
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37
-; SI-NEXT: v_mov_b32_e32 v37, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56
; SI-NEXT: v_cvt_f32_f16_e32 v56, v2
-; SI-NEXT: v_mov_b32_e32 v27, v58
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v9
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v13
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26
+; SI-NEXT: v_mov_b32_e32 v26, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v21, v4
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35
+; SI-NEXT: v_mov_b32_e32 v35, v55
+; SI-NEXT: v_mov_b32_e32 v55, v53
+; SI-NEXT: v_mov_b32_e32 v34, v43
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v50
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_mov_b32_e32 v32, v42
+; SI-NEXT: v_mov_b32_e32 v42, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v51
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31
+; SI-NEXT: v_mov_b32_e32 v31, v36
+; SI-NEXT: v_mov_b32_e32 v36, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT: v_mov_b32_e32 v37, v24
+; SI-NEXT: v_mov_b32_e32 v24, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_mov_b32_e32 v38, v40
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_mov_b32_e32 v40, v50
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: s_waitcnt vmcnt(7) expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v52
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52
+; SI-NEXT: v_mov_b32_e32 v54, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
; SI-NEXT: s_cbranch_execnz .LBB101_3
; SI-NEXT: .LBB101_2: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v36
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -229798,10 +229798,10 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28
@@ -229810,335 +229810,329 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v63, v63
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v62, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v63
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v61, v61
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v60, v60
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v59, v59
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v58
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v43
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v56
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v39
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v51
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v23
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v25
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v26
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v27
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v24
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v45, v45
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v22
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v58
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v25
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v55
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v54
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v52
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v42
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v14
; SI-NEXT: v_cvt_f32_f16_e32 v14, v53
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v15
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v48
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v14, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v7
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v14, v44
; SI-NEXT: v_cvt_f32_f16_e32 v44, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v7
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v28
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v30
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v14, v41
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v43
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v14, v40
; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v13
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v52
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v55
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v49
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f32_f16_e32 v43, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v51
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v50
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v14
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: .LBB101_3: ; %end
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v53, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v35
; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_or_b32_e32 v14, v14, v53
; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v38
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0
; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v16
; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
@@ -230154,7 +230148,7 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
@@ -230163,57 +230157,61 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v37
; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v17
; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v45
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v19
; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v34
; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v19
; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v26
; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v33
; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -230221,13 +230219,13 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v38
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v21
; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v59
; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0
@@ -230236,25 +230234,21 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v44
; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v61
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v46
; SI-NEXT: v_add_i32_e32 v16, vcc, 60, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v62
; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0
@@ -230265,14 +230259,16 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v63
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v47
; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v46
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_or_b32_e32 v1, v14, v1
; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -230282,21 +230278,23 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v57
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v58
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v57
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
@@ -230305,13 +230303,11 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
@@ -230323,20 +230319,20 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v58
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
@@ -230348,33 +230344,35 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v33
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v54
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v34
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -230399,20 +230397,17 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB101_4:
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v55, v53
+; SI-NEXT: v_mov_b32_e32 v30, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v53, v54
-; SI-NEXT: v_mov_b32_e32 v40, v52
-; SI-NEXT: v_mov_b32_e32 v55, v50
-; SI-NEXT: v_mov_b32_e32 v42, v51
-; SI-NEXT: v_mov_b32_e32 v28, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -230436,70 +230431,75 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_mov_b32_e32 v27, v58
-; SI-NEXT: v_mov_b32_e32 v26, v57
+; SI-NEXT: v_mov_b32_e32 v40, v50
+; SI-NEXT: v_mov_b32_e32 v42, v41
+; SI-NEXT: v_mov_b32_e32 v36, v54
+; SI-NEXT: v_mov_b32_e32 v29, v58
+; SI-NEXT: v_mov_b32_e32 v28, v26
+; SI-NEXT: v_mov_b32_e32 v27, v57
; SI-NEXT: v_mov_b32_e32 v25, v56
-; SI-NEXT: v_mov_b32_e32 v24, v47
; SI-NEXT: v_mov_b32_e32 v23, v46
+; SI-NEXT: v_mov_b32_e32 v22, v21
+; SI-NEXT: v_mov_b32_e32 v20, v44
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr5
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; kill: killed $vgpr7
; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; kill: killed $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr9
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; kill: killed $vgpr10
; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr11
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; kill: killed $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; kill: killed $vgpr13
; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; kill: killed $vgpr43
; SI-NEXT: ; implicit-def: $vgpr43
@@ -233773,10 +233773,11 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v63, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v62
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63
+; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v31
+; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v63
; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v61
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f32_f16_e32 v60, v60
; SI-NEXT: v_cvt_f32_f16_e32 v61, v59
; SI-NEXT: v_cvt_f32_f16_e32 v58, v58
@@ -233897,7 +233898,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -234138,9 +234138,9 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v63
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v62
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v31
; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v62
; SI-NEXT: v_cvt_f16_f32_e32 v58, v58
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
@@ -234806,16 +234806,15 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76
; SI-NEXT: v_cvt_f16_f32_e32 v40, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v6
; SI-NEXT: v_mov_b32_e32 v46, v26
; SI-NEXT: v_cvt_f16_f32_e32 v43, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v3
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v44, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v59, v9
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
@@ -234828,588 +234827,599 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v23
; SI-NEXT: v_cvt_f16_f32_e32 v45, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v26, v25
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v57, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v28, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_cvt_f16_f32_e32 v8, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v23, s17
; SI-NEXT: v_cvt_f16_f32_e32 v24, s18
-; SI-NEXT: v_cvt_f16_f32_e32 v25, s19
; SI-NEXT: v_cvt_f16_f32_e32 v29, s20
-; SI-NEXT: v_cvt_f16_f32_e32 v30, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v27, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v9, s21
+; SI-NEXT: v_cvt_f16_f32_e32 v27, s23
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v31, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v35
; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT: v_cvt_f16_f32_e32 v59, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v48
-; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v39
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v50
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f16_f32_e32 v62, v62
-; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v63, v63
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v50
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v51
-; SI-NEXT: v_cvt_f16_f32_e32 v23, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v38, s22
-; SI-NEXT: v_cvt_f16_f32_e32 v37, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v48, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v49, s26
-; SI-NEXT: v_cvt_f16_f32_e32 v35, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v50, s28
-; SI-NEXT: v_cvt_f16_f32_e32 v51, s29
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v25, s19
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v6, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v35, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v38, s25
+; SI-NEXT: v_cvt_f16_f32_e32 v48, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v49, s27
+; SI-NEXT: v_cvt_f16_f32_e32 v32, s28
+; SI-NEXT: v_cvt_f16_f32_e32 v39, s29
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB103_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v24, v19
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v60
+; SI-NEXT: v_mov_b32_e32 v39, v30
+; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v30
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43
-; SI-NEXT: v_mov_b32_e32 v43, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_mov_b32_e32 v50, v19
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v56
+; SI-NEXT: v_mov_b32_e32 v23, v17
+; SI-NEXT: v_mov_b32_e32 v25, v20
+; SI-NEXT: v_mov_b32_e32 v29, v21
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: v_mov_b32_e32 v51, v22
-; SI-NEXT: v_mov_b32_e32 v38, v16
-; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_mov_b32_e32 v37, v45
-; SI-NEXT: v_mov_b32_e32 v27, v26
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26
-; SI-NEXT: v_mov_b32_e32 v49, v47
-; SI-NEXT: v_mov_b32_e32 v35, v28
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v59
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22
+; SI-NEXT: v_mov_b32_e32 v48, v26
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26
+; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v45
+; SI-NEXT: v_mov_b32_e32 v49, v16
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v16
+; SI-NEXT: v_mov_b32_e32 v35, v46
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v57
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v46
+; SI-NEXT: v_mov_b32_e32 v43, v59
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v58
+; SI-NEXT: v_mov_b32_e32 v32, v28
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v28
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v34
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37
+; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v47
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v52
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v54
-; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41
-; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v62
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v54
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v41
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v62
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v50
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v36
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
-; SI-NEXT: v_mov_b32_e32 v57, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32
-; SI-NEXT: v_mov_b32_e32 v32, v7
-; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT: v_mov_b32_e32 v33, v12
-; SI-NEXT: v_mov_b32_e32 v34, v5
-; SI-NEXT: v_mov_b32_e32 v58, v7
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v61
-; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v63
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v44
-; SI-NEXT: v_mov_b32_e32 v44, v18
-; SI-NEXT: v_mov_b32_e32 v5, v43
-; SI-NEXT: v_mov_b32_e32 v18, v6
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3
; SI-NEXT: s_branch .LBB103_3
; SI-NEXT: .LBB103_2:
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: v_mov_b32_e32 v39, v30
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: v_mov_b32_e32 v32, v28
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: v_mov_b32_e32 v35, v46
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: v_mov_b32_e32 v49, v16
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: v_mov_b32_e32 v48, v26
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: v_mov_b32_e32 v51, v22
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: v_mov_b32_e32 v29, v21
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: v_mov_b32_e32 v25, v20
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: v_mov_b32_e32 v24, v19
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: v_mov_b32_e32 v23, v17
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: v_mov_b32_e32 v35, v28
-; SI-NEXT: v_mov_b32_e32 v49, v47
-; SI-NEXT: v_mov_b32_e32 v27, v26
-; SI-NEXT: v_mov_b32_e32 v37, v45
-; SI-NEXT: v_mov_b32_e32 v38, v16
-; SI-NEXT: v_mov_b32_e32 v51, v22
-; SI-NEXT: v_mov_b32_e32 v50, v19
+; SI-NEXT: ; kill: killed $vgpr5
; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: v_mov_b32_e32 v5, v6
; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: v_mov_b32_e32 v43, v59
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v5, v57
+; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: .LBB103_3: ; %Flow
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v36, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v43, v9
-; SI-NEXT: v_mov_b32_e32 v12, v31
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT: v_mov_b32_e32 v31, v11
-; SI-NEXT: v_mov_b32_e32 v9, v17
+; SI-NEXT: v_mov_b32_e32 v47, v49
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v50, v2
; SI-NEXT: s_cbranch_vccnz .LBB103_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v36
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v3
; SI-NEXT: v_cvt_f32_f16_e32 v10, v63
; SI-NEXT: v_cvt_f32_f16_e32 v14, v62
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v26, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT: v_mov_b32_e32 v1, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v24
+; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
+; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45
+; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v25
+; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v29
+; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14
; SI-NEXT: v_cvt_f32_f16_e32 v14, v55
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v54
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v41
; SI-NEXT: v_cvt_f32_f16_e32 v8, v42
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v61
-; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v41
-; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15
-; SI-NEXT: v_mov_b32_e32 v6, v37
-; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v39
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v10
; SI-NEXT: v_cvt_f32_f16_e32 v10, v52
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v51
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v60
-; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v50
-; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v13
-; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v5
+; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v53
; SI-NEXT: v_cvt_f32_f16_e32 v42, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v18
-; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v44
+; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v8
; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
-; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v49
-; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v2
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v37
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10
+; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v23
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v47
+; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v38
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v46
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
+; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v34, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v36, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v51, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v40, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v44, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v46, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v47, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v57, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v2
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v26
; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v35
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v58, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v26, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v22, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v19, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v35, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v13, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v59, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59
; SI-NEXT: v_cvt_f16_f32_e32 v59, v59
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v61, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61
; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v62, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62
; SI-NEXT: v_cvt_f16_f32_e32 v62, v62
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v63, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63
; SI-NEXT: v_cvt_f16_f32_e32 v63, v63
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -235455,14 +235465,22 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v57
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v22
; SI-NEXT: v_cvt_f16_f32_e32 v5, v26
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v17
; SI-NEXT: v_cvt_f16_f32_e32 v7, v58
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v57
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v30
+; SI-NEXT: v_mov_b32_e32 v30, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -235486,7 +235504,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v44
; SI-NEXT: v_cvt_f16_f32_e32 v5, v43
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v42
; SI-NEXT: v_cvt_f16_f32_e32 v12, v41
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -235521,99 +235539,92 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_cvt_f16_f32_e32 v5, v33
; SI-NEXT: v_cvt_f16_f32_e32 v7, v32
; SI-NEXT: v_cvt_f16_f32_e32 v12, v31
-; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v28
; SI-NEXT: v_cvt_f16_f32_e32 v5, v21
; SI-NEXT: v_cvt_f16_f32_e32 v7, v11
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v6
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v6
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v16
+; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v14
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v20
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v17
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v52
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v23
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v15
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v48
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v24
-; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v14
-; SI-NEXT: v_mov_b32_e32 v16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v15
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v27
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v37
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v25
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v55
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v30
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v49
+; SI-NEXT: v_mov_b32_e32 v8, v13
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
+; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v53
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v54
-; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v55
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v48
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4
+; SI-NEXT: v_mov_b32_e32 v4, v18
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; SI-NEXT: v_mov_b32_e32 v4, v27
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3
-; SI-NEXT: v_mov_b32_e32 v3, v13
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2
; SI-NEXT: .LBB103_5: ; %end
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -235703,7 +235714,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -235764,17 +235775,19 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -235784,8 +235797,8 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -235794,7 +235807,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -235804,102 +235817,100 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -236251,17 +236262,17 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48
@@ -236280,169 +236291,170 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v37
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v44
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v38
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v44
; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v7, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v7, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40
; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v59
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v59
; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; kill: killed $vgpr45
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; kill: killed $vgpr45
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56
; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57
; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60
; SI-NEXT: v_mul_f32_e32 v57, 1.0, v62
; SI-NEXT: v_mul_f32_e32 v56, 1.0, v63
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v43
; SI-NEXT: ; kill: killed $vgpr45
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr41
@@ -236457,106 +236469,96 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; kill: killed $vgpr45
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_mul_f32_e32 v58, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v22
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v23
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v24
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:132
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v22
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v34
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; kill: killed $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB104_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v33
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v34
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
@@ -236565,14 +236567,10 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47
; SI-NEXT: ; kill: killed $vgpr1
@@ -236584,7 +236582,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
@@ -236592,7 +236590,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57
; SI-NEXT: ; kill: killed $vgpr1
@@ -236604,265 +236602,277 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v25
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v40
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v55
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v32
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v21
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v20
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v54
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v53
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11
-; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: .LBB104_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB104_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v33
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v43
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v34
; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33
-; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v28, v34, v28, 16
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v54
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32
-; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34
+; SI-NEXT: v_alignbit_b32 v28, v35, v28, 16
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v31
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v32
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31
-; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v53
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v34
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v51
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; SI-NEXT: v_alignbit_b32 v28, v32, v28, 16
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; SI-NEXT: v_alignbit_b32 v15, v22, v15, 16
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v53
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
+; SI-NEXT: v_alignbit_b32 v28, v32, v28, 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
-; SI-NEXT: v_alignbit_b32 v15, v19, v15, 16
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; SI-NEXT: v_alignbit_b32 v13, v28, v13, 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15
-; SI-NEXT: v_alignbit_b32 v13, v18, v13, 16
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; SI-NEXT: v_alignbit_b32 v13, v21, v13, 16
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13
+; SI-NEXT: v_alignbit_b32 v11, v20, v11, 16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v19
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; SI-NEXT: v_alignbit_b32 v11, v19, v11, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT: v_alignbit_b32 v10, v13, v10, 16
+; SI-NEXT: v_alignbit_b32 v11, v17, v11, 16
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16
+; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
@@ -236873,254 +236883,256 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; SI-NEXT: v_alignbit_b32 v4, v8, v4, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v16
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_alignbit_b32 v4, v10, v4, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v24
+; SI-NEXT: v_alignbit_b32 v4, v11, v4, 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v61, v1, v3, 16
+; SI-NEXT: v_alignbit_b32 v19, v1, v3, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v58
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v58
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v19, v10, v3, 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v27
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v11, v3, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v4, v13, v4, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_alignbit_b32 v4, v16, v4, 16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v20, v10, v8, 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v21, v11, v8, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_alignbit_b32 v4, v17, v4, 16
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_alignbit_b32 v4, v16, v4, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v21, v10, v8, 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v44
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v28
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v22
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v22, v11, v8, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v23, v10, v8, 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v23, v11, v8, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v24
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v24, v11, v8, 16
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v18, v16, v4, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_alignbit_b32 v45, v17, v4, 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16
; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
-; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16
-; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v2, v22, v2, 16
+; SI-NEXT: v_alignbit_b32 v1, v19, v1, 16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v24, v10, v8, 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v25, v11, v8, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33
; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v37, v38, v11, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v11, v37, v11, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v35, v36, v11, 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v39, v48, v12, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v35, v36, v10, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16
-; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_alignbit_b32 v37, v38, v13, 16
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v10, v37, v10, 16
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v10, v35, v11, 16
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_alignbit_b32 v39, v48, v13, 16
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_alignbit_b32 v49, v50, v13, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_alignbit_b32 v28, v43, v9, 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v18
+; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v34
+; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16
; SI-NEXT: v_alignbit_b32 v13, v49, v13, 16
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v9, v28, v9, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_alignbit_b32 v51, v52, v14, 16
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v31
; SI-NEXT: v_alignbit_b32 v14, v51, v14, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v21, v6, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
+; SI-NEXT: v_alignbit_b32 v2, v20, v3, 16
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v45, v4, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: .LBB104_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237135,7 +237147,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237151,7 +237163,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237167,7 +237179,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237183,7 +237195,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237199,7 +237211,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237215,7 +237227,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237231,7 +237243,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237240,14 +237252,14 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237257,15 +237269,17 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237284,8 +237298,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237304,8 +237318,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237325,7 +237339,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237344,8 +237358,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237364,8 +237378,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237373,9 +237387,11 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -237383,7 +237399,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -237391,12 +237407,10 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -239757,282 +239771,892 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v45, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v62, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25
; SI-NEXT: v_mul_f32_e32 v22, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v63, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v28
; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v29, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28
-; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v15, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v24, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s29
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v32
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v34
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v35
-; SI-NEXT: v_mul_f32_e32 v59, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v48
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v49
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v37
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v38
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v49
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v51
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50
-; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v51
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v52
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v53
-; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v52
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v53
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v54
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v41
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v42
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s17
+; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v40
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v41
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v39, 1.0, s17
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v38, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v31, 1.0, s21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT: s_cbranch_scc0 .LBB105_2
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: s_cbranch_scc0 .LBB105_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v5
-; SI-NEXT: v_mov_b32_e32 v42, v62
-; SI-NEXT: v_mov_b32_e32 v43, v63
-; SI-NEXT: v_mov_b32_e32 v55, v12
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v25
-; SI-NEXT: v_mov_b32_e32 v25, v60
-; SI-NEXT: v_mov_b32_e32 v54, v47
-; SI-NEXT: v_mov_b32_e32 v40, v20
-; SI-NEXT: v_mov_b32_e32 v51, v61
-; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46
-; SI-NEXT: v_mov_b32_e32 v29, v31
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v45
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v11, v5
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v8, v47
+; SI-NEXT: v_mov_b32_e32 v39, v13
+; SI-NEXT: v_mov_b32_e32 v31, v6
+; SI-NEXT: v_mov_b32_e32 v29, v61
; SI-NEXT: v_mov_b32_e32 v24, v56
+; SI-NEXT: v_mov_b32_e32 v38, v4
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v57
+; SI-NEXT: v_mov_b32_e32 v25, v60
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_mov_b32_e32 v52, v10
-; SI-NEXT: v_mov_b32_e32 v53, v59
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_mov_b32_e32 v55, v12
+; SI-NEXT: v_mov_b32_e32 v40, v58
+; SI-NEXT: v_mov_b32_e32 v52, v59
+; SI-NEXT: v_mov_b32_e32 v43, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v48
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mov_b32_e32 v62, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v50, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT: v_mov_b32_e32 v62, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v63
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v36
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v41, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32
-; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mov_b32_e32 v41, v1
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v47
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v56
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v54
+; SI-NEXT: v_mov_b32_e32 v54, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53
+; SI-NEXT: v_mov_b32_e32 v53, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v58
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v28
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v35
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v34
+; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33
; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: v_mov_b32_e32 v34, v13
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; SI-NEXT: v_mov_b32_e32 v13, v4
+; SI-NEXT: s_cbranch_execnz .LBB105_3
+; SI-NEXT: .LBB105_2: ; %cmp.true
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v52
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56
-; SI-NEXT: v_mov_b32_e32 v39, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11
+; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14
+; SI-NEXT: v_alignbit_b32 v1, v16, v1, 16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v48
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v37, v38
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16
+; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_alignbit_b32 v53, v20, v1, 16
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v39
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20
+; SI-NEXT: v_alignbit_b32 v54, v22, v1, 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24
+; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29
+; SI-NEXT: v_alignbit_b32 v17, v27, v1, 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v25
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v24
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31
+; SI-NEXT: v_alignbit_b32 v28, v28, v1, 16
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34
-; SI-NEXT: s_branch .LBB105_3
-; SI-NEXT: .LBB105_2:
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v55, v12
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32
+; SI-NEXT: v_alignbit_b32 v30, v30, v1, 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32
+; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v31
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_alignbit_b32 v26, v58, v1, 16
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_alignbit_b32 v23, v27, v1, 16
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v21, v23, v21, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_alignbit_b32 v56, v59, v1, 16
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_alignbit_b32 v12, v22, v1, 16
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_alignbit_b32 v63, v1, v20, 16
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT: v_alignbit_b32 v47, v63, v19, 16
+; SI-NEXT: v_alignbit_b32 v19, v56, v13, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v24
+; SI-NEXT: v_alignbit_b32 v60, v18, v20, 16
+; SI-NEXT: v_alignbit_b32 v16, v60, v16, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v29
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v61, v7, v25, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v57, v7, v31, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v31
+; SI-NEXT: v_alignbit_b32 v24, v46, v11, 16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11
+; SI-NEXT: v_alignbit_b32 v25, v45, v9, 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v44, v15, v3, 16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; SI-NEXT: v_alignbit_b32 v9, v3, v9, 16
+; SI-NEXT: v_alignbit_b32 v2, v20, v2, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; SI-NEXT: v_alignbit_b32 v50, v51, v39, 16
+; SI-NEXT: v_alignbit_b32 v4, v50, v4, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v44, v38, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v25, v37, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v24, v34, 16
+; SI-NEXT: v_mov_b32_e32 v7, v25
+; SI-NEXT: v_mov_b32_e32 v37, v17
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v57, v33, 16
+; SI-NEXT: v_alignbit_b32 v6, v61, v14, 16
+; SI-NEXT: v_alignbit_b32 v17, v12, v32, 16
+; SI-NEXT: v_alignbit_b32 v14, v26, v10, 16
+; SI-NEXT: v_alignbit_b32 v32, v48, v8, 16
+; SI-NEXT: v_alignbit_b32 v5, v36, v52, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: .LBB105_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; SI-NEXT: v_or_b32_e32 v8, v8, v10
+; SI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v50
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51
+; SI-NEXT: v_or_b32_e32 v8, v8, v10
+; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, v8, v4
+; SI-NEXT: v_add_i32_e32 v8, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; SI-NEXT: v_or_b32_e32 v4, v4, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20
+; SI-NEXT: v_or_b32_e32 v4, v4, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v44
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v57
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v61
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v63
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: .LBB105_4:
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v39, v13
; SI-NEXT: v_mov_b32_e32 v33, v34
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -240064,695 +240688,73 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v5
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: v_mov_b32_e32 v51, v61
-; SI-NEXT: v_mov_b32_e32 v42, v62
-; SI-NEXT: v_mov_b32_e32 v29, v31
+; SI-NEXT: v_mov_b32_e32 v11, v5
+; SI-NEXT: v_mov_b32_e32 v31, v6
+; SI-NEXT: v_mov_b32_e32 v29, v61
; SI-NEXT: v_mov_b32_e32 v25, v60
; SI-NEXT: v_mov_b32_e32 v24, v56
-; SI-NEXT: v_mov_b32_e32 v54, v47
-; SI-NEXT: v_mov_b32_e32 v40, v20
-; SI-NEXT: v_mov_b32_e32 v43, v63
-; SI-NEXT: v_mov_b32_e32 v52, v10
-; SI-NEXT: v_mov_b32_e32 v53, v59
-; SI-NEXT: v_mov_b32_e32 v39, v4
-; SI-NEXT: v_mov_b32_e32 v37, v38
-; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v8, v47
+; SI-NEXT: v_mov_b32_e32 v55, v12
+; SI-NEXT: v_mov_b32_e32 v40, v58
+; SI-NEXT: v_mov_b32_e32 v52, v59
+; SI-NEXT: v_mov_b32_e32 v38, v4
+; SI-NEXT: v_mov_b32_e32 v43, v37
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr5
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: .LBB105_3: ; %Flow
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT: s_cbranch_vccnz .LBB105_5
-; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v39
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11
-; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14
-; SI-NEXT: v_alignbit_b32 v51, v16, v1, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16
-; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v18, v20, v1, 16
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20
-; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v33
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v27, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v24
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v28, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v52, v30, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32
-; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v31
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v28, v59, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v46, v61, v31, 16
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v21, v30, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v23, v10, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v63, v23, v27, 16
-; SI-NEXT: v_alignbit_b32 v27, v21, v12, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v57, v58, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v17, v1, v20, 16
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24
-; SI-NEXT: v_alignbit_b32 v56, v47, v20, 16
-; SI-NEXT: v_alignbit_b32 v20, v62, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v16, v56, v16, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v22, v45, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29
-; SI-NEXT: v_alignbit_b32 v13, v60, v25, 16
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9
-; SI-NEXT: v_alignbit_b32 v24, v44, v3, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v9, v11, v9, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v15
-; SI-NEXT: v_mov_b32_e32 v15, v24
-; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v3, v39, 16
-; SI-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16
-; SI-NEXT: v_alignbit_b32 v5, v36, v7, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16
-; SI-NEXT: v_alignbit_b32 v6, v46, v33, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v24, v38, 16
-; SI-NEXT: v_alignbit_b32 v38, v48, v8, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v4, v22, v37, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v57, v32, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v4, v20, v34, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v20, v52
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v13, v14, 16
-; SI-NEXT: v_mov_b32_e32 v14, v51
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: .LBB105_5: ; %end
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
-; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_or_b32_e32 v4, v7, v4
-; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11
-; SI-NEXT: v_or_b32_e32 v4, v4, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; SI-NEXT: v_or_b32_e32 v4, v4, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: s_branch .LBB105_2
;
; VI-LABEL: bitcast_v64bf16_to_v64i16_scalar:
; VI: ; %bb.0:
@@ -243834,50 +243836,50 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_mov_b32 s60, s16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s17, 0
-; SI-NEXT: s_mov_b32 s61, s19
; SI-NEXT: v_writelane_b32 v41, s60, 1
-; SI-NEXT: s_mov_b32 s63, s18
-; SI-NEXT: v_writelane_b32 v41, s61, 2
+; SI-NEXT: s_mov_b32 s61, s18
+; SI-NEXT: v_writelane_b32 v41, s19, 2
; SI-NEXT: s_mov_b32 s72, s21
-; SI-NEXT: v_writelane_b32 v41, s63, 3
+; SI-NEXT: v_writelane_b32 v41, s61, 3
; SI-NEXT: v_writelane_b32 v41, s72, 4
; SI-NEXT: s_mov_b32 s74, s23
; SI-NEXT: v_writelane_b32 v41, s20, 5
; SI-NEXT: v_writelane_b32 v41, s74, 6
-; SI-NEXT: s_mov_b32 s75, s25
+; SI-NEXT: s_mov_b32 s76, s25
; SI-NEXT: v_writelane_b32 v41, s22, 7
-; SI-NEXT: v_writelane_b32 v41, s75, 8
-; SI-NEXT: s_mov_b32 s76, s27
+; SI-NEXT: v_writelane_b32 v41, s76, 8
+; SI-NEXT: s_mov_b32 s78, s27
; SI-NEXT: v_writelane_b32 v41, s24, 9
-; SI-NEXT: v_writelane_b32 v41, s76, 10
-; SI-NEXT: s_mov_b32 s93, s29
+; SI-NEXT: v_writelane_b32 v41, s78, 10
+; SI-NEXT: s_mov_b32 s79, s29
; SI-NEXT: v_writelane_b32 v41, s26, 11
-; SI-NEXT: v_writelane_b32 v41, s93, 12
-; SI-NEXT: v_readfirstlane_b32 s16, v2
+; SI-NEXT: v_writelane_b32 v41, s79, 12
+; SI-NEXT: v_readfirstlane_b32 s6, v2
; SI-NEXT: v_writelane_b32 v41, s28, 13
; SI-NEXT: v_readfirstlane_b32 s73, v4
-; SI-NEXT: v_writelane_b32 v41, s16, 14
-; SI-NEXT: v_readfirstlane_b32 s89, v3
+; SI-NEXT: v_writelane_b32 v41, s6, 14
+; SI-NEXT: v_readfirstlane_b32 s95, v3
; SI-NEXT: v_writelane_b32 v41, s73, 15
-; SI-NEXT: v_readfirstlane_b32 s90, v6
-; SI-NEXT: v_writelane_b32 v41, s89, 16
-; SI-NEXT: v_readfirstlane_b32 s91, v5
-; SI-NEXT: v_writelane_b32 v41, s90, 17
-; SI-NEXT: v_readfirstlane_b32 s34, v8
-; SI-NEXT: v_writelane_b32 v41, s91, 18
-; SI-NEXT: v_readfirstlane_b32 s35, v7
-; SI-NEXT: v_writelane_b32 v41, s34, 19
-; SI-NEXT: v_readfirstlane_b32 s36, v10
-; SI-NEXT: v_writelane_b32 v41, s35, 20
+; SI-NEXT: v_readfirstlane_b32 s36, v6
+; SI-NEXT: v_writelane_b32 v41, s95, 16
+; SI-NEXT: v_readfirstlane_b32 s30, v5
+; SI-NEXT: v_writelane_b32 v41, s36, 17
+; SI-NEXT: v_readfirstlane_b32 s37, v8
+; SI-NEXT: v_writelane_b32 v41, s30, 18
+; SI-NEXT: v_readfirstlane_b32 s38, v7
+; SI-NEXT: v_writelane_b32 v41, s37, 19
+; SI-NEXT: v_readfirstlane_b32 s39, v10
+; SI-NEXT: v_writelane_b32 v41, s38, 20
; SI-NEXT: v_writelane_b32 v40, s96, 32
-; SI-NEXT: v_readfirstlane_b32 s37, v9
-; SI-NEXT: v_writelane_b32 v41, s36, 21
+; SI-NEXT: v_readfirstlane_b32 s48, v9
+; SI-NEXT: v_writelane_b32 v41, s39, 21
+; SI-NEXT: v_writelane_b32 v40, s97, 33
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s62, v31
+; SI-NEXT: v_readfirstlane_b32 s92, v31
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s80, v32
+; SI-NEXT: v_readfirstlane_b32 s93, v32
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s69, v33
+; SI-NEXT: v_readfirstlane_b32 s90, v33
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
@@ -243889,20 +243891,19 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s84, v34
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s68, v35
+; SI-NEXT: v_readfirstlane_b32 s94, v35
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s83, v36
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s87, v38
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
-; SI-NEXT: v_readfirstlane_b32 s6, v37
+; SI-NEXT: v_readfirstlane_b32 s91, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
-; SI-NEXT: v_writelane_b32 v40, s97, 33
-; SI-NEXT: v_readfirstlane_b32 s38, v12
-; SI-NEXT: v_writelane_b32 v41, s37, 22
+; SI-NEXT: v_readfirstlane_b32 s49, v12
+; SI-NEXT: v_writelane_b32 v41, s48, 22
; SI-NEXT: v_writelane_b32 v40, s98, 34
; SI-NEXT: v_readfirstlane_b32 s14, v30
; SI-NEXT: v_readfirstlane_b32 s15, v29
@@ -243912,21 +243913,21 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_readfirstlane_b32 s11, v25
; SI-NEXT: v_readfirstlane_b32 s8, v24
; SI-NEXT: v_readfirstlane_b32 s9, v23
-; SI-NEXT: v_readfirstlane_b32 s88, v22
-; SI-NEXT: v_readfirstlane_b32 s29, v21
-; SI-NEXT: v_readfirstlane_b32 s79, v20
-; SI-NEXT: v_readfirstlane_b32 s27, v19
-; SI-NEXT: v_readfirstlane_b32 s78, v18
-; SI-NEXT: v_readfirstlane_b32 s25, v17
-; SI-NEXT: v_readfirstlane_b32 s77, v16
-; SI-NEXT: v_readfirstlane_b32 s23, v15
-; SI-NEXT: v_readfirstlane_b32 s39, v14
-; SI-NEXT: v_readfirstlane_b32 s21, v13
-; SI-NEXT: v_readfirstlane_b32 s19, v11
+; SI-NEXT: v_readfirstlane_b32 s89, v22
+; SI-NEXT: v_readfirstlane_b32 s7, v21
+; SI-NEXT: v_readfirstlane_b32 s88, v20
+; SI-NEXT: v_readfirstlane_b32 s29, v19
+; SI-NEXT: v_readfirstlane_b32 s77, v18
+; SI-NEXT: v_readfirstlane_b32 s27, v17
+; SI-NEXT: v_readfirstlane_b32 s75, v16
+; SI-NEXT: v_readfirstlane_b32 s25, v15
+; SI-NEXT: v_readfirstlane_b32 s50, v14
+; SI-NEXT: v_readfirstlane_b32 s23, v13
+; SI-NEXT: v_readfirstlane_b32 s21, v11
; SI-NEXT: v_readfirstlane_b32 s18, v1
-; SI-NEXT: v_writelane_b32 v41, s38, 23
+; SI-NEXT: v_writelane_b32 v41, s49, 23
; SI-NEXT: v_writelane_b32 v40, s99, 35
-; SI-NEXT: v_writelane_b32 v41, s39, 24
+; SI-NEXT: v_writelane_b32 v41, s50, 24
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s58, v31
; SI-NEXT: s_waitcnt vmcnt(11)
@@ -243955,165 +243956,166 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_cbranch_scc0 .LBB107_2
; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_lshl_b32 s5, s17, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 25
+; SI-NEXT: s_lshl_b32 s5, s61, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 26
+; SI-NEXT: s_lshl_b32 s5, s20, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 27
+; SI-NEXT: s_lshl_b32 s5, s22, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 28
+; SI-NEXT: s_lshl_b32 s5, s24, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 29
+; SI-NEXT: s_lshl_b32 s5, s26, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 30
+; SI-NEXT: s_lshl_b32 s5, s28, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 31
+; SI-NEXT: s_lshl_b32 s5, s18, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 32
+; SI-NEXT: s_lshl_b32 s5, s95, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 33
+; SI-NEXT: s_lshl_b32 s5, s38, 16
; SI-NEXT: s_lshl_b32 s4, s60, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 25
-; SI-NEXT: s_lshl_b32 s4, s63, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 26
-; SI-NEXT: s_lshl_b32 s4, s20, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 27
-; SI-NEXT: s_lshl_b32 s4, s22, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 28
-; SI-NEXT: s_lshl_b32 s4, s24, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 29
-; SI-NEXT: s_lshl_b32 s4, s26, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 30
-; SI-NEXT: s_lshl_b32 s4, s28, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 31
-; SI-NEXT: s_lshl_b32 s4, s18, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 32
-; SI-NEXT: s_lshl_b32 s4, s89, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 33
-; SI-NEXT: s_lshl_b32 s4, s91, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 34
-; SI-NEXT: s_lshl_b32 s4, s35, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 35
-; SI-NEXT: s_lshl_b32 s4, s37, 16
-; SI-NEXT: s_lshl_b32 s7, s17, 16
-; SI-NEXT: s_lshl_b32 s96, s61, 16
-; SI-NEXT: s_lshl_b32 s99, s72, 16
-; SI-NEXT: s_lshl_b32 s97, s74, 16
-; SI-NEXT: s_lshl_b32 s92, s75, 16
-; SI-NEXT: s_lshl_b32 s94, s76, 16
-; SI-NEXT: s_lshl_b32 s95, s93, 16
-; SI-NEXT: s_lshl_b32 s93, s16, 16
-; SI-NEXT: s_lshl_b32 s30, s73, 16
-; SI-NEXT: s_lshl_b32 s31, s90, 16
-; SI-NEXT: s_lshl_b32 s34, s34, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 36
-; SI-NEXT: s_lshl_b32 s35, s36, 16
-; SI-NEXT: s_lshl_b32 s86, s19, 16
-; SI-NEXT: s_lshl_b32 s36, s38, 16
-; SI-NEXT: s_lshl_b32 s22, s21, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 34
+; SI-NEXT: s_lshl_b32 s5, s48, 16
+; SI-NEXT: s_lshl_b32 s16, s19, 16
+; SI-NEXT: s_lshl_b32 s63, s72, 16
+; SI-NEXT: s_lshl_b32 s98, s74, 16
+; SI-NEXT: s_lshl_b32 s62, s76, 16
+; SI-NEXT: s_lshl_b32 s96, s78, 16
+; SI-NEXT: s_lshl_b32 s31, s79, 16
+; SI-NEXT: s_lshl_b32 s34, s6, 16
+; SI-NEXT: s_lshl_b32 s35, s73, 16
+; SI-NEXT: s_lshl_b32 s30, s30, 16
+; SI-NEXT: s_lshl_b32 s36, s36, 16
+; SI-NEXT: s_lshl_b32 s99, s37, 16
+; SI-NEXT: v_writelane_b32 v41, s5, 35
; SI-NEXT: s_lshl_b32 s37, s39, 16
-; SI-NEXT: s_lshl_b32 s24, s23, 16
-; SI-NEXT: s_lshl_b32 s38, s77, 16
-; SI-NEXT: s_lshl_b32 s28, s25, 16
-; SI-NEXT: s_lshl_b32 s39, s78, 16
-; SI-NEXT: s_lshl_b32 s61, s27, 16
-; SI-NEXT: s_lshl_b32 s48, s79, 16
-; SI-NEXT: s_lshl_b32 s89, s29, 16
-; SI-NEXT: s_lshl_b32 s49, s88, 16
-; SI-NEXT: s_lshl_b32 s60, s9, 16
-; SI-NEXT: s_lshl_b32 s50, s8, 16
-; SI-NEXT: s_lshl_b32 s90, s11, 16
-; SI-NEXT: s_lshl_b32 s91, s10, 16
-; SI-NEXT: s_lshl_b32 s70, s13, 16
-; SI-NEXT: s_lshl_b32 s51, s12, 16
-; SI-NEXT: s_lshl_b32 s71, s15, 16
-; SI-NEXT: s_lshl_b32 s52, s14, 16
-; SI-NEXT: s_lshl_b32 s20, s41, 16
-; SI-NEXT: s_lshl_b32 s53, s40, 16
-; SI-NEXT: s_lshl_b32 s81, s43, 16
-; SI-NEXT: s_lshl_b32 s54, s42, 16
-; SI-NEXT: s_lshl_b32 s63, s45, 16
-; SI-NEXT: s_lshl_b32 s55, s44, 16
-; SI-NEXT: s_lshl_b32 s72, s47, 16
-; SI-NEXT: s_lshl_b32 s64, s46, 16
-; SI-NEXT: s_lshl_b32 s82, s57, 16
-; SI-NEXT: s_lshl_b32 s65, s56, 16
-; SI-NEXT: s_lshl_b32 s74, s59, 16
-; SI-NEXT: s_lshl_b32 s66, s58, 16
-; SI-NEXT: s_lshl_b32 s75, s87, 16
-; SI-NEXT: s_mov_b32 s73, s6
-; SI-NEXT: s_lshl_b32 s67, s6, 16
-; SI-NEXT: s_lshl_b32 s76, s83, 16
-; SI-NEXT: s_mov_b32 s16, s68
-; SI-NEXT: s_lshl_b32 s68, s68, 16
-; SI-NEXT: s_lshl_b32 s85, s84, 16
-; SI-NEXT: s_mov_b32 s98, s69
-; SI-NEXT: s_lshl_b32 s69, s69, 16
-; SI-NEXT: s_lshl_b32 s17, s80, 16
-; SI-NEXT: s_mov_b32 s6, s62
-; SI-NEXT: s_lshl_b32 s26, s62, 16
+; SI-NEXT: s_lshl_b32 s19, s21, 16
+; SI-NEXT: s_lshl_b32 s38, s49, 16
+; SI-NEXT: s_lshl_b32 s20, s23, 16
+; SI-NEXT: s_lshl_b32 s39, s50, 16
+; SI-NEXT: s_lshl_b32 s22, s25, 16
+; SI-NEXT: s_lshl_b32 s48, s75, 16
+; SI-NEXT: s_lshl_b32 s60, s27, 16
+; SI-NEXT: s_lshl_b32 s49, s77, 16
+; SI-NEXT: s_lshl_b32 s24, s29, 16
+; SI-NEXT: s_lshl_b32 s50, s88, 16
+; SI-NEXT: s_lshl_b32 s61, s7, 16
+; SI-NEXT: s_lshl_b32 s51, s89, 16
+; SI-NEXT: s_lshl_b32 s28, s9, 16
+; SI-NEXT: s_lshl_b32 s52, s8, 16
+; SI-NEXT: s_lshl_b32 s72, s11, 16
+; SI-NEXT: s_lshl_b32 s53, s10, 16
+; SI-NEXT: s_lshl_b32 s74, s13, 16
+; SI-NEXT: s_lshl_b32 s54, s12, 16
+; SI-NEXT: s_lshl_b32 s95, s15, 16
+; SI-NEXT: s_lshl_b32 s55, s14, 16
+; SI-NEXT: s_lshl_b32 s81, s41, 16
+; SI-NEXT: s_lshl_b32 s64, s40, 16
+; SI-NEXT: s_lshl_b32 s82, s43, 16
+; SI-NEXT: s_lshl_b32 s65, s42, 16
+; SI-NEXT: s_lshl_b32 s85, s45, 16
+; SI-NEXT: s_lshl_b32 s66, s44, 16
+; SI-NEXT: s_lshl_b32 s86, s47, 16
+; SI-NEXT: s_lshl_b32 s67, s46, 16
+; SI-NEXT: s_lshl_b32 s76, s57, 16
+; SI-NEXT: s_lshl_b32 s68, s56, 16
+; SI-NEXT: s_lshl_b32 s97, s59, 16
+; SI-NEXT: s_lshl_b32 s69, s58, 16
+; SI-NEXT: s_lshl_b32 s78, s87, 16
+; SI-NEXT: s_mov_b32 s6, s91
+; SI-NEXT: s_lshl_b32 s70, s91, 16
+; SI-NEXT: s_lshl_b32 s79, s83, 16
+; SI-NEXT: s_mov_b32 s73, s94
+; SI-NEXT: s_lshl_b32 s71, s94, 16
+; SI-NEXT: s_lshl_b32 s26, s84, 16
+; SI-NEXT: s_mov_b32 s91, s90
+; SI-NEXT: s_lshl_b32 s90, s90, 16
+; SI-NEXT: s_mov_b32 s94, s93
+; SI-NEXT: s_lshl_b32 s17, s93, 16
+; SI-NEXT: s_mov_b32 s93, s92
+; SI-NEXT: s_lshl_b32 s80, s92, 16
+; SI-NEXT: s_mov_b32 s92, s4
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB107_3
; SI-NEXT: .LBB107_2:
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s16, s68
+; SI-NEXT: s_mov_b32 s73, s94
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s73, s6
+; SI-NEXT: s_mov_b32 s6, s91
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s6, s62
+; SI-NEXT: s_mov_b32 s94, s93
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s98, s69
+; SI-NEXT: s_mov_b32 s93, s92
+; SI-NEXT: ; implicit-def: $sgpr17
+; SI-NEXT: ; kill: killed $sgpr17
+; SI-NEXT: s_mov_b32 s91, s90
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr17
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr7
-; SI-NEXT: ; implicit-def: $sgpr96
-; SI-NEXT: ; implicit-def: $sgpr99
-; SI-NEXT: ; implicit-def: $sgpr97
+; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr94
-; SI-NEXT: ; implicit-def: $sgpr95
-; SI-NEXT: ; implicit-def: $sgpr93
-; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; kill: killed $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr17
+; SI-NEXT: ; kill: killed $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr63
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr96
; SI-NEXT: ; implicit-def: $sgpr31
; SI-NEXT: ; implicit-def: $sgpr34
; SI-NEXT: ; implicit-def: $sgpr35
-; SI-NEXT: ; implicit-def: $sgpr86
+; SI-NEXT: ; implicit-def: $sgpr30
; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $sgpr99
; SI-NEXT: ; implicit-def: $sgpr37
-; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr19
; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr20
; SI-NEXT: ; implicit-def: $sgpr39
-; SI-NEXT: ; implicit-def: $sgpr61
+; SI-NEXT: ; implicit-def: $sgpr22
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr49
; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr49
+; SI-NEXT: ; implicit-def: $sgpr24
; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr91
-; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr61
; SI-NEXT: ; implicit-def: $sgpr51
-; SI-NEXT: ; implicit-def: $sgpr71
+; SI-NEXT: ; implicit-def: $sgpr28
; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr72
; SI-NEXT: ; implicit-def: $sgpr53
-; SI-NEXT: ; implicit-def: $sgpr81
+; SI-NEXT: ; implicit-def: $sgpr74
; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr63
+; SI-NEXT: ; implicit-def: $sgpr95
; SI-NEXT: ; implicit-def: $sgpr55
-; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr81
; SI-NEXT: ; implicit-def: $sgpr64
; SI-NEXT: ; implicit-def: $sgpr82
; SI-NEXT: ; implicit-def: $sgpr65
-; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr85
; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr75
+; SI-NEXT: ; implicit-def: $sgpr86
; SI-NEXT: ; implicit-def: $sgpr67
; SI-NEXT: ; implicit-def: $sgpr76
; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr85
+; SI-NEXT: ; implicit-def: $sgpr97
; SI-NEXT: ; implicit-def: $sgpr69
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr79
+; SI-NEXT: ; implicit-def: $sgpr71
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr80
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr17
; SI-NEXT: ; implicit-def: $sgpr17
@@ -244121,86 +244123,97 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: .LBB107_3: ; %Flow
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT: s_mov_b32 s5, s17
-; SI-NEXT: s_mov_b32 s17, s86
-; SI-NEXT: s_mov_b32 s86, s7
+; SI-NEXT: s_mov_b32 s4, s17
+; SI-NEXT: s_mov_b32 s17, s30
+; SI-NEXT: s_mov_b32 s30, s99
+; SI-NEXT: s_mov_b32 s99, s16
; SI-NEXT: s_cbranch_vccnz .LBB107_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_lshl_b32 s5, s6, 16
+; SI-NEXT: s_add_i32 s4, s94, 3
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: s_lshl_b32 s5, s93, 16
+; SI-NEXT: s_add_i32 s84, s84, 3
+; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: s_and_b32 s5, s84, 0xffff
+; SI-NEXT: s_lshl_b32 s60, s91, 16
+; SI-NEXT: s_add_i32 s83, s83, 3
+; SI-NEXT: s_or_b32 s5, s60, s5
+; SI-NEXT: s_and_b32 s60, s83, 0xffff
+; SI-NEXT: s_lshl_b32 s61, s73, 16
+; SI-NEXT: s_or_b32 s79, s61, s60
+; SI-NEXT: s_lshl_b32 s61, s6, 16
+; SI-NEXT: s_add_i32 s9, s9, 3
; SI-NEXT: v_readlane_b32 s6, v41, 24
+; SI-NEXT: s_add_i32 s11, s11, 3
+; SI-NEXT: s_and_b32 s9, s9, 0xffff
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_add_i32 s7, s7, 3
; SI-NEXT: s_lshl_b32 s20, s6, 16
; SI-NEXT: v_readlane_b32 s6, v41, 23
-; SI-NEXT: s_lshl_b32 s17, s6, 16
-; SI-NEXT: v_readlane_b32 s6, v41, 22
-; SI-NEXT: s_lshl_b32 s61, s16, 16
-; SI-NEXT: s_add_i32 s16, s6, 3
-; SI-NEXT: v_readlane_b32 s6, v41, 21
-; SI-NEXT: s_and_b32 s16, s16, 0xffff
-; SI-NEXT: s_lshl_b32 s7, s6, 16
-; SI-NEXT: v_readlane_b32 s6, v41, 20
-; SI-NEXT: s_or_b32 s7, s7, s16
-; SI-NEXT: s_add_i32 s6, s6, 3
-; SI-NEXT: v_readlane_b32 s16, v41, 19
-; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: s_and_b32 s6, s6, 0xffff
-; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: s_and_b32 s19, s19, 0xffff
-; SI-NEXT: s_or_b32 s6, s16, s6
-; SI-NEXT: v_readlane_b32 s16, v41, 18
-; SI-NEXT: s_lshl_b32 s60, s98, 16
-; SI-NEXT: s_or_b32 s17, s17, s19
-; SI-NEXT: s_add_i32 s98, s16, 3
-; SI-NEXT: v_readlane_b32 s19, v41, 17
-; SI-NEXT: s_add_i32 s21, s21, 3
-; SI-NEXT: s_and_b32 s16, s98, 0xffff
-; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_add_i32 s11, s11, 3
-; SI-NEXT: s_add_i32 s9, s9, 3
-; SI-NEXT: s_and_b32 s21, s21, 0xffff
-; SI-NEXT: s_or_b32 s16, s19, s16
-; SI-NEXT: v_readlane_b32 s19, v41, 16
; SI-NEXT: s_add_i32 s13, s13, 3
; SI-NEXT: s_and_b32 s11, s11, 0xffff
; SI-NEXT: s_lshl_b32 s10, s10, 16
-; SI-NEXT: s_and_b32 s9, s9, 0xffff
-; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_or_b32 s8, s8, s9
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_lshl_b32 s9, s89, 16
; SI-NEXT: s_add_i32 s29, s29, 3
-; SI-NEXT: s_or_b32 s20, s20, s21
-; SI-NEXT: s_add_i32 s96, s19, 3
-; SI-NEXT: v_readlane_b32 s21, v41, 15
+; SI-NEXT: s_lshl_b32 s19, s6, 16
+; SI-NEXT: v_readlane_b32 s6, v41, 22
; SI-NEXT: s_add_i32 s15, s15, 3
; SI-NEXT: s_and_b32 s13, s13, 0xffff
; SI-NEXT: s_lshl_b32 s12, s12, 16
; SI-NEXT: s_or_b32 s10, s10, s11
-; SI-NEXT: s_or_b32 s8, s8, s9
+; SI-NEXT: s_or_b32 s7, s9, s7
; SI-NEXT: s_and_b32 s9, s29, 0xffff
; SI-NEXT: s_lshl_b32 s11, s88, 16
; SI-NEXT: s_add_i32 s27, s27, 3
-; SI-NEXT: s_and_b32 s19, s96, 0xffff
-; SI-NEXT: s_lshl_b32 s21, s21, 16
+; SI-NEXT: s_add_i32 s16, s6, 3
+; SI-NEXT: v_readlane_b32 s6, v41, 21
; SI-NEXT: s_and_b32 s15, s15, 0xffff
; SI-NEXT: s_lshl_b32 s14, s14, 16
; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: s_or_b32 s9, s11, s9
; SI-NEXT: s_and_b32 s11, s27, 0xffff
-; SI-NEXT: s_lshl_b32 s13, s79, 16
+; SI-NEXT: s_lshl_b32 s13, s77, 16
; SI-NEXT: s_add_i32 s25, s25, 3
-; SI-NEXT: s_or_b32 s19, s21, s19
-; SI-NEXT: s_add_i32 s18, s18, 3
-; SI-NEXT: v_readlane_b32 s21, v41, 14
+; SI-NEXT: s_and_b32 s16, s16, 0xffff
+; SI-NEXT: s_lshl_b32 s17, s6, 16
+; SI-NEXT: v_readlane_b32 s6, v41, 20
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: s_or_b32 s11, s13, s11
; SI-NEXT: s_and_b32 s13, s25, 0xffff
-; SI-NEXT: s_lshl_b32 s15, s78, 16
+; SI-NEXT: s_lshl_b32 s15, s75, 16
; SI-NEXT: s_add_i32 s23, s23, 3
-; SI-NEXT: s_and_b32 s18, s18, 0xffff
-; SI-NEXT: s_lshl_b32 s21, s21, 16
+; SI-NEXT: s_or_b32 s16, s17, s16
+; SI-NEXT: s_add_i32 s6, s6, 3
+; SI-NEXT: v_readlane_b32 s17, v41, 19
; SI-NEXT: s_or_b32 s13, s15, s13
; SI-NEXT: s_and_b32 s15, s23, 0xffff
-; SI-NEXT: s_lshl_b32 s22, s77, 16
+; SI-NEXT: s_add_i32 s21, s21, 3
+; SI-NEXT: s_and_b32 s6, s6, 0xffff
+; SI-NEXT: s_lshl_b32 s17, s17, 16
+; SI-NEXT: s_or_b32 s15, s20, s15
+; SI-NEXT: s_and_b32 s20, s21, 0xffff
+; SI-NEXT: s_or_b32 s6, s17, s6
+; SI-NEXT: v_readlane_b32 s17, v41, 18
+; SI-NEXT: s_or_b32 s19, s19, s20
+; SI-NEXT: s_add_i32 s98, s17, 3
+; SI-NEXT: v_readlane_b32 s20, v41, 17
+; SI-NEXT: s_and_b32 s17, s98, 0xffff
+; SI-NEXT: s_lshl_b32 s20, s20, 16
+; SI-NEXT: s_or_b32 s17, s20, s17
+; SI-NEXT: v_readlane_b32 s20, v41, 16
+; SI-NEXT: s_add_i32 s96, s20, 3
+; SI-NEXT: v_readlane_b32 s21, v41, 15
+; SI-NEXT: s_and_b32 s20, s96, 0xffff
+; SI-NEXT: s_lshl_b32 s21, s21, 16
+; SI-NEXT: s_or_b32 s20, s21, s20
+; SI-NEXT: s_add_i32 s18, s18, 3
+; SI-NEXT: v_readlane_b32 s21, v41, 14
+; SI-NEXT: s_and_b32 s18, s18, 0xffff
+; SI-NEXT: s_lshl_b32 s21, s21, 16
; SI-NEXT: s_or_b32 s18, s21, s18
; SI-NEXT: v_readlane_b32 s21, v41, 13
-; SI-NEXT: s_or_b32 s15, s22, s15
; SI-NEXT: s_add_i32 s21, s21, 3
; SI-NEXT: v_readlane_b32 s22, v41, 12
; SI-NEXT: s_and_b32 s21, s21, 0xffff
@@ -244244,40 +244257,27 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_or_b32 s27, s28, s27
; SI-NEXT: s_add_i32 s27, s27, 0x30000
; SI-NEXT: s_add_i32 s26, s26, 0x30000
-; SI-NEXT: s_and_b32 s86, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s27, s27, 16
+; SI-NEXT: s_and_b32 s28, s27, 0xffff0000
; SI-NEXT: s_add_i32 s25, s25, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s27, 25
-; SI-NEXT: s_and_b32 s96, s26, 0xffff0000
+; SI-NEXT: v_writelane_b32 v41, s28, 25
+; SI-NEXT: s_and_b32 s99, s26, 0xffff0000
; SI-NEXT: s_lshl_b32 s26, s26, 16
; SI-NEXT: s_add_i32 s24, s24, 0x30000
; SI-NEXT: v_writelane_b32 v41, s26, 26
-; SI-NEXT: s_and_b32 s99, s25, 0xffff0000
+; SI-NEXT: s_and_b32 s63, s25, 0xffff0000
; SI-NEXT: s_lshl_b32 s25, s25, 16
; SI-NEXT: s_add_i32 s23, s23, 0x30000
; SI-NEXT: v_writelane_b32 v41, s25, 27
-; SI-NEXT: s_and_b32 s97, s24, 0xffff0000
+; SI-NEXT: s_and_b32 s98, s24, 0xffff0000
; SI-NEXT: s_lshl_b32 s24, s24, 16
-; SI-NEXT: s_add_i32 s80, s80, 3
; SI-NEXT: s_add_i32 s22, s22, 0x30000
; SI-NEXT: v_writelane_b32 v41, s24, 28
-; SI-NEXT: s_and_b32 s92, s23, 0xffff0000
+; SI-NEXT: s_and_b32 s62, s23, 0xffff0000
; SI-NEXT: s_lshl_b32 s23, s23, 16
-; SI-NEXT: s_and_b32 s4, s80, 0xffff
-; SI-NEXT: s_add_i32 s84, s84, 3
; SI-NEXT: s_add_i32 s21, s21, 0x30000
; SI-NEXT: v_writelane_b32 v41, s23, 29
-; SI-NEXT: s_and_b32 s94, s22, 0xffff0000
+; SI-NEXT: s_and_b32 s96, s22, 0xffff0000
; SI-NEXT: s_lshl_b32 s22, s22, 16
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: s_and_b32 s5, s84, 0xffff
-; SI-NEXT: s_add_i32 s83, s83, 3
-; SI-NEXT: s_add_i32 s18, s18, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s22, 30
-; SI-NEXT: s_and_b32 s95, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s21, s21, 16
-; SI-NEXT: s_or_b32 s5, s60, s5
-; SI-NEXT: s_and_b32 s60, s83, 0xffff
; SI-NEXT: s_add_i32 s87, s87, 3
; SI-NEXT: s_add_i32 s59, s59, 3
; SI-NEXT: s_add_i32 s57, s57, 3
@@ -244285,13 +244285,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_add_i32 s45, s45, 3
; SI-NEXT: s_add_i32 s43, s43, 3
; SI-NEXT: s_add_i32 s41, s41, 3
-; SI-NEXT: s_add_i32 s19, s19, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s21, 31
-; SI-NEXT: s_and_b32 s93, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s18, s18, 16
-; SI-NEXT: s_or_b32 s76, s61, s60
+; SI-NEXT: s_add_i32 s18, s18, 0x30000
+; SI-NEXT: v_writelane_b32 v41, s22, 30
+; SI-NEXT: s_and_b32 s31, s21, 0xffff0000
+; SI-NEXT: s_lshl_b32 s21, s21, 16
; SI-NEXT: s_and_b32 s60, s87, 0xffff
-; SI-NEXT: s_lshl_b32 s61, s73, 16
; SI-NEXT: s_and_b32 s59, s59, 0xffff
; SI-NEXT: s_lshl_b32 s58, s58, 16
; SI-NEXT: s_and_b32 s57, s57, 0xffff
@@ -244304,10 +244302,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_lshl_b32 s42, s42, 16
; SI-NEXT: s_and_b32 s41, s41, 0xffff
; SI-NEXT: s_lshl_b32 s40, s40, 16
-; SI-NEXT: s_add_i32 s16, s16, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s18, 32
-; SI-NEXT: s_lshl_b32 s18, s19, 16
-; SI-NEXT: s_or_b32 s75, s61, s60
+; SI-NEXT: s_add_i32 s20, s20, 0x30000
+; SI-NEXT: v_writelane_b32 v41, s21, 31
+; SI-NEXT: s_and_b32 s34, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s18, s18, 16
+; SI-NEXT: s_or_b32 s78, s61, s60
; SI-NEXT: s_or_b32 s58, s58, s59
; SI-NEXT: s_or_b32 s56, s56, s57
; SI-NEXT: s_or_b32 s46, s46, s47
@@ -244315,13 +244314,12 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_or_b32 s42, s42, s43
; SI-NEXT: s_or_b32 s40, s40, s41
; SI-NEXT: s_add_i32 s6, s6, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s18, 33
-; SI-NEXT: s_and_b32 s31, s16, 0xffff0000
-; SI-NEXT: s_lshl_b32 s16, s16, 16
+; SI-NEXT: v_writelane_b32 v41, s18, 32
+; SI-NEXT: s_lshl_b32 s18, s20, 16
; SI-NEXT: s_add_i32 s4, s4, 0x30000
; SI-NEXT: s_add_i32 s5, s5, 0x30000
-; SI-NEXT: s_add_i32 s76, s76, 0x30000
-; SI-NEXT: s_add_i32 s75, s75, 0x30000
+; SI-NEXT: s_add_i32 s79, s79, 0x30000
+; SI-NEXT: s_add_i32 s78, s78, 0x30000
; SI-NEXT: s_add_i32 s58, s58, 0x30000
; SI-NEXT: s_add_i32 s56, s56, 0x30000
; SI-NEXT: s_add_i32 s46, s46, 0x30000
@@ -244332,293 +244330,296 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_add_i32 s12, s12, 0x30000
; SI-NEXT: s_add_i32 s10, s10, 0x30000
; SI-NEXT: s_add_i32 s8, s8, 0x30000
+; SI-NEXT: s_add_i32 s7, s7, 0x30000
; SI-NEXT: s_add_i32 s9, s9, 0x30000
; SI-NEXT: s_add_i32 s11, s11, 0x30000
; SI-NEXT: s_add_i32 s13, s13, 0x30000
; SI-NEXT: s_add_i32 s15, s15, 0x30000
-; SI-NEXT: s_add_i32 s20, s20, 0x30000
+; SI-NEXT: s_add_i32 s19, s19, 0x30000
+; SI-NEXT: s_add_i32 s16, s16, 0x30000
; SI-NEXT: s_add_i32 s17, s17, 0x30000
-; SI-NEXT: s_add_i32 s7, s7, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s16, 34
-; SI-NEXT: s_and_b32 s34, s6, 0xffff0000
+; SI-NEXT: v_writelane_b32 v41, s18, 33
+; SI-NEXT: s_and_b32 s30, s6, 0xffff0000
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_and_b32 s30, s19, 0xffff0000
-; SI-NEXT: v_writelane_b32 v41, s6, 35
-; SI-NEXT: s_and_b32 s35, s7, 0xffff0000
-; SI-NEXT: s_lshl_b32 s6, s7, 16
+; SI-NEXT: s_lshl_b32 s92, s27, 16
+; SI-NEXT: s_and_b32 s35, s20, 0xffff0000
; SI-NEXT: s_and_b32 s36, s17, 0xffff0000
; SI-NEXT: s_lshl_b32 s17, s17, 16
-; SI-NEXT: s_and_b32 s37, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s22, s20, 16
-; SI-NEXT: s_and_b32 s38, s15, 0xffff0000
-; SI-NEXT: s_lshl_b32 s24, s15, 16
-; SI-NEXT: s_and_b32 s39, s13, 0xffff0000
-; SI-NEXT: s_lshl_b32 s28, s13, 16
-; SI-NEXT: s_and_b32 s48, s11, 0xffff0000
-; SI-NEXT: s_lshl_b32 s61, s11, 16
-; SI-NEXT: s_and_b32 s49, s9, 0xffff0000
-; SI-NEXT: s_lshl_b32 s89, s9, 16
-; SI-NEXT: s_and_b32 s50, s8, 0xffff0000
-; SI-NEXT: s_lshl_b32 s60, s8, 16
-; SI-NEXT: s_and_b32 s91, s10, 0xffff0000
-; SI-NEXT: s_lshl_b32 s90, s10, 16
-; SI-NEXT: s_and_b32 s51, s12, 0xffff0000
-; SI-NEXT: s_lshl_b32 s70, s12, 16
-; SI-NEXT: s_and_b32 s52, s14, 0xffff0000
-; SI-NEXT: s_lshl_b32 s71, s14, 16
-; SI-NEXT: s_and_b32 s53, s40, 0xffff0000
-; SI-NEXT: s_lshl_b32 s20, s40, 16
-; SI-NEXT: s_and_b32 s54, s42, 0xffff0000
-; SI-NEXT: s_lshl_b32 s81, s42, 16
-; SI-NEXT: s_and_b32 s55, s44, 0xffff0000
-; SI-NEXT: s_lshl_b32 s63, s44, 16
-; SI-NEXT: s_and_b32 s64, s46, 0xffff0000
-; SI-NEXT: s_lshl_b32 s72, s46, 16
-; SI-NEXT: s_and_b32 s65, s56, 0xffff0000
-; SI-NEXT: s_lshl_b32 s82, s56, 16
-; SI-NEXT: s_and_b32 s66, s58, 0xffff0000
-; SI-NEXT: s_lshl_b32 s74, s58, 16
-; SI-NEXT: s_and_b32 s67, s75, 0xffff0000
-; SI-NEXT: s_lshl_b32 s75, s75, 16
-; SI-NEXT: s_and_b32 s68, s76, 0xffff0000
-; SI-NEXT: s_lshl_b32 s76, s76, 16
-; SI-NEXT: s_and_b32 s69, s5, 0xffff0000
-; SI-NEXT: s_lshl_b32 s85, s5, 16
-; SI-NEXT: s_and_b32 s26, s4, 0xffff0000
-; SI-NEXT: s_lshl_b32 s5, s4, 16
-; SI-NEXT: v_writelane_b32 v41, s6, 36
+; SI-NEXT: v_writelane_b32 v41, s6, 34
+; SI-NEXT: s_and_b32 s37, s16, 0xffff0000
+; SI-NEXT: s_lshl_b32 s6, s16, 16
+; SI-NEXT: s_and_b32 s38, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s19, s19, 16
+; SI-NEXT: s_and_b32 s39, s15, 0xffff0000
+; SI-NEXT: s_lshl_b32 s20, s15, 16
+; SI-NEXT: s_and_b32 s48, s13, 0xffff0000
+; SI-NEXT: s_lshl_b32 s22, s13, 16
+; SI-NEXT: s_and_b32 s49, s11, 0xffff0000
+; SI-NEXT: s_lshl_b32 s60, s11, 16
+; SI-NEXT: s_and_b32 s50, s9, 0xffff0000
+; SI-NEXT: s_lshl_b32 s24, s9, 16
+; SI-NEXT: s_and_b32 s51, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s61, s7, 16
+; SI-NEXT: s_and_b32 s52, s8, 0xffff0000
+; SI-NEXT: s_lshl_b32 s28, s8, 16
+; SI-NEXT: s_and_b32 s53, s10, 0xffff0000
+; SI-NEXT: s_lshl_b32 s72, s10, 16
+; SI-NEXT: s_and_b32 s54, s12, 0xffff0000
+; SI-NEXT: s_lshl_b32 s74, s12, 16
+; SI-NEXT: s_and_b32 s55, s14, 0xffff0000
+; SI-NEXT: s_lshl_b32 s95, s14, 16
+; SI-NEXT: s_and_b32 s64, s40, 0xffff0000
+; SI-NEXT: s_lshl_b32 s81, s40, 16
+; SI-NEXT: s_and_b32 s65, s42, 0xffff0000
+; SI-NEXT: s_lshl_b32 s82, s42, 16
+; SI-NEXT: s_and_b32 s66, s44, 0xffff0000
+; SI-NEXT: s_lshl_b32 s85, s44, 16
+; SI-NEXT: s_and_b32 s67, s46, 0xffff0000
+; SI-NEXT: s_lshl_b32 s86, s46, 16
+; SI-NEXT: s_and_b32 s68, s56, 0xffff0000
+; SI-NEXT: s_lshl_b32 s76, s56, 16
+; SI-NEXT: s_and_b32 s69, s58, 0xffff0000
+; SI-NEXT: s_lshl_b32 s97, s58, 16
+; SI-NEXT: s_and_b32 s70, s78, 0xffff0000
+; SI-NEXT: s_lshl_b32 s78, s78, 16
+; SI-NEXT: s_and_b32 s71, s79, 0xffff0000
+; SI-NEXT: s_lshl_b32 s79, s79, 16
+; SI-NEXT: s_and_b32 s90, s5, 0xffff0000
+; SI-NEXT: s_lshl_b32 s26, s5, 16
+; SI-NEXT: s_and_b32 s80, s4, 0xffff0000
+; SI-NEXT: s_lshl_b32 s4, s4, 16
+; SI-NEXT: v_writelane_b32 v41, s6, 35
; SI-NEXT: .LBB107_5: ; %end
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86
-; SI-NEXT: v_readlane_b32 s4, v41, 25
+; SI-NEXT: v_readlane_b32 s5, v41, 25
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96
-; SI-NEXT: v_readlane_b32 s4, v41, 26
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99
+; SI-NEXT: v_readlane_b32 s5, v41, 26
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99
-; SI-NEXT: v_readlane_b32 s4, v41, 27
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
+; SI-NEXT: v_readlane_b32 s5, v41, 27
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97
-; SI-NEXT: v_readlane_b32 s4, v41, 28
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98
+; SI-NEXT: v_readlane_b32 s5, v41, 28
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92
-; SI-NEXT: v_readlane_b32 s4, v41, 29
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62
+; SI-NEXT: v_readlane_b32 s5, v41, 29
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94
-; SI-NEXT: v_readlane_b32 s4, v41, 30
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96
+; SI-NEXT: v_readlane_b32 s5, v41, 30
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
-; SI-NEXT: v_readlane_b32 s4, v41, 31
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31
+; SI-NEXT: v_readlane_b32 s5, v41, 31
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93
-; SI-NEXT: v_readlane_b32 s4, v41, 32
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34
+; SI-NEXT: v_readlane_b32 s5, v41, 32
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30
-; SI-NEXT: v_readlane_b32 s4, v41, 33
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
+; SI-NEXT: v_readlane_b32 s5, v41, 33
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31
-; SI-NEXT: v_readlane_b32 s4, v41, 34
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34
-; SI-NEXT: v_readlane_b32 s4, v41, 35
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30
+; SI-NEXT: v_readlane_b32 s5, v41, 34
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
-; SI-NEXT: v_readlane_b32 s4, v41, 36
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
+; SI-NEXT: v_readlane_b32 s5, v41, 35
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s95
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s71
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -245123,94 +245124,94 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v44, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v30
; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v22
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v40, v8
; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
; SI-NEXT: v_cvt_f16_f32_e32 v53, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v43, v3
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v18
; SI-NEXT: v_cvt_f16_f32_e32 v9, v14
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v17
; SI-NEXT: v_cvt_f16_f32_e32 v18, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v20
-; SI-NEXT: v_cvt_f16_f32_e32 v20, v24
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v35
; SI-NEXT: v_cvt_f16_f32_e32 v31, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v25, v50
; SI-NEXT: v_cvt_f16_f32_e32 v27, v29
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v42
; SI-NEXT: v_cvt_f16_f32_e32 v21, v47
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v38
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v49
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v24, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v50
; SI-NEXT: v_cvt_f16_f32_e32 v29, v51
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v54
; SI-NEXT: v_cvt_f16_f32_e32 v16, v41
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v58
; SI-NEXT: v_cvt_f16_f32_e32 v17, v45
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v56
; SI-NEXT: v_cvt_f16_f32_e32 v11, v57
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v58
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v62
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v59
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v32
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v60
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v37
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v32
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v33
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v36
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v37
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:104
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v63
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v61
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v62
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cvt_f16_f32_e32 v45, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v45, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v39
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v14, v6
; SI-NEXT: s_waitcnt vmcnt(5)
@@ -245219,15 +245220,15 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v41, v8
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v10
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v32
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v37
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v32
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v46
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f16_f32_e32 v46, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v6
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -245240,17 +245241,18 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_mov_b32_e32 v59, v29
; SI-NEXT: v_mov_b32_e32 v29, v27
; SI-NEXT: v_mov_b32_e32 v57, v23
-; SI-NEXT: v_mov_b32_e32 v60, v3
-; SI-NEXT: v_mov_b32_e32 v62, v4
-; SI-NEXT: v_mov_b32_e32 v63, v49
-; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v60, v4
+; SI-NEXT: v_mov_b32_e32 v62, v13
; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB108_2
; SI-NEXT: ; %bb.1: ; %cmp.true
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v61
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
@@ -245280,17 +245282,20 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v14, v14, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v36
; SI-NEXT: v_or_b32_e32 v33, v33, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
@@ -245300,111 +245305,106 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v61
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_or_b32_e32 v56, v37, v39
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v12
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT: v_or_b32_e32 v11, v11, v37
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v52
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v61
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_or_b32_e32 v61, v3, v37
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58
-; SI-NEXT: v_or_b32_e32 v11, v11, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17
; SI-NEXT: v_or_b32_e32 v16, v16, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_or_b32_e32 v21, v21, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_or_b32_e32 v24, v24, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v31, v31, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20
; SI-NEXT: v_or_b32_e32 v19, v19, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1
; SI-NEXT: v_or_b32_e32 v18, v18, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v37
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v37
; SI-NEXT: v_cvt_f32_f16_e32 v37, v53
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v53, v37
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
; SI-NEXT: v_or_b32_e32 v52, v37, v39
; SI-NEXT: v_cvt_f32_f16_e32 v37, v40
; SI-NEXT: v_cvt_f32_f16_e32 v39, v55
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v40, v37
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_or_b32_e32 v55, v37, v39
; SI-NEXT: v_cvt_f32_f16_e32 v37, v44
; SI-NEXT: v_cvt_f32_f16_e32 v39, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v57
-; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v44, v37
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_or_b32_e32 v43, v37, v39
-; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v63
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v57
+; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_cvt_f32_f16_e32 v47, v47
; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v47
; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
@@ -245412,27 +245412,22 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
; SI-NEXT: v_cvt_f32_f16_e32 v45, v45
; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_cvt_f32_f16_e32 v46, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
-; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47
; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
+; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v47
; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45
@@ -245443,125 +245438,131 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
-; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
; SI-NEXT: v_cvt_f16_f32_e32 v45, v45
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT: v_or_b32_e32 v38, v38, v47
; SI-NEXT: v_or_b32_e32 v54, v54, v42
+; SI-NEXT: v_or_b32_e32 v49, v49, v51
; SI-NEXT: v_or_b32_e32 v45, v45, v50
; SI-NEXT: v_or_b32_e32 v41, v41, v30
; SI-NEXT: v_or_b32_e32 v46, v46, v32
-; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16
; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16
+; SI-NEXT: v_alignbit_b32 v51, v56, v51, 16
; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16
; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16
; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v3
-; SI-NEXT: v_or_b32_e32 v3, v37, v34
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v4
+; SI-NEXT: v_or_b32_e32 v4, v37, v34
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v39
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16
-; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: v_alignbit_b32 v63, v55, v37, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v3, v39, v1
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v4, v48, v37
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v3
-; SI-NEXT: v_or_b32_e32 v3, v37, v5
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v4
+; SI-NEXT: v_or_b32_e32 v4, v39, v5
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v3, v39, v9
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v62
+; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16
+; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v4
+; SI-NEXT: v_or_b32_e32 v4, v39, v9
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT: v_or_b32_e32 v62, v56, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v60
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16
-; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v62, v48, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v60
+; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; SI-NEXT: v_or_b32_e32 v60, v56, v39
-; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
-; SI-NEXT: v_or_b32_e32 v57, v56, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v59
+; SI-NEXT: v_or_b32_e32 v60, v48, v39
+; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v57
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
+; SI-NEXT: v_or_b32_e32 v57, v48, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v59
; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16
-; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
+; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; SI-NEXT: v_or_b32_e32 v29, v29, v23
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v4
+; SI-NEXT: v_alignbit_b32 v4, v18, v13, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v19, v39, 16
; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_or_b32_e32 v59, v56, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v63
-; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
-; SI-NEXT: v_or_b32_e32 v63, v56, v35
-; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v3
-; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT: v_or_b32_e32 v3, v49, v51
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v19, v39, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v24, v23, 16
-; SI-NEXT: v_alignbit_b32 v49, v18, v37, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v4, v24, v23, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v59, v48, v27
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v58
+; SI-NEXT: v_or_b32_e32 v38, v38, v48
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v21, v27, 16
-; SI-NEXT: v_alignbit_b32 v51, v61, v51, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v4, v21, v27, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v47
+; SI-NEXT: v_or_b32_e32 v58, v47, v35
+; SI-NEXT: v_alignbit_b32 v47, v16, v48, 16
+; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16
; SI-NEXT: .LBB108_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v37, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v37, 0xffff, v4
; SI-NEXT: v_or_b32_e32 v34, v37, v34
; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -245570,58 +245571,52 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v34, v34, v37
; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0
; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v63
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v1, v34, v1
-; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40
-; SI-NEXT: v_or_b32_e32 v1, v1, v34
-; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4
+; SI-NEXT: v_or_b32_e32 v34, v34, v37
+; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v34, 0xffff, v55
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40
+; SI-NEXT: v_or_b32_e32 v34, v34, v37
+; SI-NEXT: v_add_i32_e32 v37, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4
+; SI-NEXT: v_or_b32_e32 v5, v34, v5
+; SI-NEXT: v_add_i32_e32 v34, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v52
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v53
+; SI-NEXT: v_or_b32_e32 v5, v5, v34
+; SI-NEXT: v_add_i32_e32 v34, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v5, v34, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; SI-NEXT: v_or_b32_e32 v5, v5, v9
+; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v62
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -245647,7 +245642,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -245661,7 +245656,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -245695,28 +245690,24 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
@@ -246071,29 +246062,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v49, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v11
; SI-NEXT: v_cvt_f16_f32_e32 v50, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v43, v8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v9
; SI-NEXT: v_cvt_f16_f32_e32 v39, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v13
; SI-NEXT: v_cvt_f16_f32_e32 v36, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v16
; SI-NEXT: v_cvt_f16_f32_e32 v32, v17
; SI-NEXT: v_cvt_f16_f32_e32 v33, v18
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_cvt_f16_f32_e32 v17, v24
@@ -246112,19 +246101,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v31, v21
; SI-NEXT: v_cvt_f16_f32_e32 v27, v46
; SI-NEXT: v_cvt_f16_f32_e32 v11, v47
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
; SI-NEXT: v_cvt_f16_f32_e32 v12, v56
; SI-NEXT: v_cvt_f16_f32_e32 v26, v58
; SI-NEXT: v_cvt_f16_f32_e32 v8, v59
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
; SI-NEXT: v_cvt_f16_f32_e32 v9, v60
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v61
; SI-NEXT: v_cvt_f16_f32_e32 v24, v62
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v63
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v63
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v6, v34
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v34, v37
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f16_f32_e32 v21, v48
@@ -246304,9 +246296,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v45
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_or_b32_e32 v60, v48, v4
; SI-NEXT: v_cvt_f32_f16_e32 v48, v62
; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
@@ -246314,153 +246306,153 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v54, v56
; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48
; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40
+; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48
-; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_or_b32_e32 v56, v54, v48
; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55
; SI-NEXT: v_cvt_f32_f16_e32 v55, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: v_or_b32_e32 v45, v40, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v7
-; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; SI-NEXT: v_or_b32_e32 v7, v41, v55
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; SI-NEXT: v_or_b32_e32 v41, v41, v55
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
+; SI-NEXT: v_alignbit_b32 v63, v46, v51, 16
+; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v45
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: v_or_b32_e32 v45, v40, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v7, v7, v40
; SI-NEXT: v_or_b32_e32 v14, v14, v13
; SI-NEXT: v_or_b32_e32 v23, v23, v17
; SI-NEXT: v_or_b32_e32 v34, v34, v21
; SI-NEXT: v_alignbit_b32 v4, v57, v4, 16
-; SI-NEXT: v_alignbit_b32 v63, v46, v51, 16
; SI-NEXT: v_alignbit_b32 v62, v29, v48, 16
; SI-NEXT: v_alignbit_b32 v61, v52, v54, 16
; SI-NEXT: v_alignbit_b32 v44, v49, v55, 16
+; SI-NEXT: v_alignbit_b32 v43, v38, v40, 16
; SI-NEXT: v_alignbit_b32 v13, v32, v13, 16
; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
; SI-NEXT: v_alignbit_b32 v21, v2, v21, 16
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: v_or_b32_e32 v7, v7, v40
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v7, v41, v10
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v41, v41, v10
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: v_or_b32_e32 v7, v41, v20
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v41, v41, v20
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v20, v31, v20, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: v_or_b32_e32 v7, v41, v28
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v41, v41, v28
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v7
-; SI-NEXT: v_or_b32_e32 v7, v41, v27
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16
+; SI-NEXT: v_or_b32_e32 v41, v41, v27
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v42
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v43
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v7, v41, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16
+; SI-NEXT: v_or_b32_e32 v41, v41, v26
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v26, v8, v26, 16
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: v_or_b32_e32 v43, v42, v24
-; SI-NEXT: v_alignbit_b32 v26, v8, v26, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v7, v41, v37
-; SI-NEXT: v_mov_b32_e32 v51, v7
-; SI-NEXT: v_alignbit_b32 v7, v38, v40, 16
+; SI-NEXT: v_or_b32_e32 v41, v42, v24
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v1
; SI-NEXT: v_alignbit_b32 v24, v5, v24, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
+; SI-NEXT: v_or_b32_e32 v1, v41, v37
+; SI-NEXT: v_mov_b32_e32 v51, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v37, v1, v37, 16
; SI-NEXT: .LBB109_3: ; %end
; SI-NEXT: v_and_b32_e32 v48, 0xffff, v60
@@ -246512,7 +246504,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
@@ -246530,9 +246521,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_i32_e32 v29, vcc, 36, v0
; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v43
; SI-NEXT: v_or_b32_e32 v4, v4, v7
; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
@@ -246543,7 +246533,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
@@ -246569,7 +246559,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
@@ -246595,7 +246585,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
@@ -246609,7 +246599,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
@@ -246623,7 +246613,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
@@ -246637,8 +246627,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: v_or_b32_e32 v4, v4, v7
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
@@ -248341,10 +248333,9 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
@@ -248364,13 +248355,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16
; SI-NEXT: v_mov_b32_e32 v42, v4
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
@@ -248381,15 +248372,14 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
@@ -248398,35 +248388,35 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
; SI-NEXT: v_cvt_f32_f16_e32 v41, s16
; SI-NEXT: v_cvt_f32_f16_e32 v44, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v43, s29
+; SI-NEXT: v_cvt_f32_f16_e32 v45, s27
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v2
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v41, s17
-; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: v_cvt_f32_f16_e32 v43, s29
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v3
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v41, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v59, s27
+; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v42
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v41, s19
-; SI-NEXT: v_mov_b32_e32 v2, v9
+; SI-NEXT: v_mov_b32_e32 v2, v5
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v41, s20
-; SI-NEXT: v_mov_b32_e32 v3, v10
+; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v6
@@ -248461,56 +248451,56 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v41, s25
-; SI-NEXT: v_mov_b32_e32 v60, v29
+; SI-NEXT: v_mov_b32_e32 v62, v28
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v11
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v41, s28
-; SI-NEXT: v_mov_b32_e32 v61, v30
+; SI-NEXT: v_mov_b32_e32 v63, v29
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v12
+; SI-NEXT: v_mov_b32_e32 v60, v30
+; SI-NEXT: v_mov_b32_e32 v61, v32
; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v62
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v13
; SI-NEXT: v_cvt_f32_f16_e32 v13, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
+; SI-NEXT: v_mov_b32_e32 v32, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v46
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v14
; SI-NEXT: v_cvt_f32_f16_e32 v14, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v53
+; SI-NEXT: v_mov_b32_e32 v31, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v59
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v15
; SI-NEXT: v_cvt_f32_f16_e32 v15, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v52
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v16
; SI-NEXT: v_cvt_f32_f16_e32 v16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v50
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v53
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v17
; SI-NEXT: v_cvt_f32_f16_e32 v17, v25
; SI-NEXT: v_mov_b32_e32 v25, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v52
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v18
; SI-NEXT: v_cvt_f32_f16_e32 v18, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v50
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v19
@@ -248521,11 +248511,16 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v20
; SI-NEXT: v_cvt_f32_f16_e32 v20, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT: v_mov_b32_e32 v26, v45
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v48
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
; SI-NEXT: s_branch .LBB111_3
@@ -248534,33 +248529,53 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; kill: killed $vgpr41
-; SI-NEXT: v_mov_b32_e32 v61, v30
+; SI-NEXT: v_mov_b32_e32 v61, v32
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; kill: killed $vgpr41
-; SI-NEXT: v_mov_b32_e32 v60, v29
+; SI-NEXT: v_mov_b32_e32 v32, v31
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; kill: killed $vgpr41
-; SI-NEXT: v_mov_b32_e32 v3, v10
+; SI-NEXT: v_mov_b32_e32 v31, v46
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; kill: killed $vgpr41
-; SI-NEXT: v_mov_b32_e32 v2, v9
+; SI-NEXT: v_mov_b32_e32 v60, v30
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; kill: killed $vgpr41
+; SI-NEXT: v_mov_b32_e32 v63, v29
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; kill: killed $vgpr41
+; SI-NEXT: v_mov_b32_e32 v62, v28
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; kill: killed $vgpr41
+; SI-NEXT: v_mov_b32_e32 v3, v7
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; kill: killed $vgpr41
+; SI-NEXT: v_mov_b32_e32 v2, v5
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; kill: killed $vgpr41
; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; kill: killed $vgpr41
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr1
@@ -248597,22 +248612,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: ; kill: killed $vgpr41
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; kill: killed $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; kill: killed $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; kill: killed $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; kill: killed $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr4
@@ -248649,10 +248648,10 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v19, v1
; SI-NEXT: s_cbranch_vccnz .LBB111_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v3
-; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v2
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v3
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v2
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: v_cvt_f32_f16_e32 v41, s16
; SI-NEXT: s_add_i32 s17, s17, 3
@@ -248707,16 +248706,17 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54
; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55
; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40
-; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v62
-; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v63
-; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
-; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
-; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v61
-; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v60
-; SI-NEXT: s_add_i32 s29, s29, 3
+; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v59
+; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v31
+; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32
+; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v61
+; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v60
+; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v63
+; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v62
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: s_add_i32 s29, s29, 3
; SI-NEXT: s_add_i32 s28, s28, 3
; SI-NEXT: s_add_i32 s27, s27, 3
; SI-NEXT: s_add_i32 s26, s26, 3
@@ -248732,75 +248732,72 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v4
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
@@ -248810,8 +248807,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v3
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v4
; SI-NEXT: v_cvt_f32_f16_e32 v4, v49
@@ -248827,7 +248822,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v7
; SI-NEXT: v_cvt_f32_f16_e32 v7, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v59, s27
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v8
@@ -248885,6 +248879,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v20
; SI-NEXT: v_cvt_f32_f16_e32 v20, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v26, s27
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v34
@@ -248945,7 +248940,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v26
; SI-NEXT: v_cvt_f16_f32_e32 v2, v25
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 1056c66720b25..f219402aa5ebf 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -3743,7 +3743,7 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s16, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24
@@ -3754,11 +3754,11 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8
; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s16
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3
; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3
@@ -3775,12 +3775,12 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8
; GFX11-TRUE16-NEXT: .LBB25_3: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s15
@@ -3796,8 +3796,8 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB25_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16
@@ -3814,7 +3814,7 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s18, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s17, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24
@@ -3826,10 +3826,10 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s17
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3
; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3
@@ -3847,10 +3847,10 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8
; GFX11-FAKE16-NEXT: .LBB25_3: ; %end
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s18
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13
@@ -3860,7 +3860,7 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB25_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
@@ -12472,7 +12472,7 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s16, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24
@@ -12483,11 +12483,11 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8
; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s16
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3
; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3
@@ -12504,12 +12504,12 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8
; GFX11-TRUE16-NEXT: .LBB69_3: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s15
@@ -12525,8 +12525,8 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB69_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16
@@ -12543,7 +12543,7 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s18, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s17, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24
@@ -12555,10 +12555,10 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s17
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3
; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3
@@ -12576,10 +12576,10 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8
; GFX11-FAKE16-NEXT: .LBB69_3: ; %end
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s18
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13
@@ -12589,7 +12589,7 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB69_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
@@ -19198,78 +19198,74 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) {
; VI-LABEL: bitcast_v8i16_to_v16i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v19, v3
-; VI-NEXT: v_mov_b32_e32 v18, v2
+; VI-NEXT: v_mov_b32_e32 v17, v3
+; VI-NEXT: v_mov_b32_e32 v16, v2
+; VI-NEXT: v_mov_b32_e32 v19, v1
+; VI-NEXT: v_mov_b32_e32 v18, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-NEXT: ; implicit-def: $vgpr16
-; VI-NEXT: ; implicit-def: $vgpr20
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: ; implicit-def: $vgpr3
-; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; implicit-def: $vgpr5
; VI-NEXT: ; implicit-def: $vgpr7
-; VI-NEXT: ; implicit-def: $vgpr8
; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: ; implicit-def: $vgpr11
; VI-NEXT: ; implicit-def: $vgpr13
; VI-NEXT: ; implicit-def: $vgpr15
-; VI-NEXT: ; implicit-def: $vgpr11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB96_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v19
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v18
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v0
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[18:19]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v16, v0
-; VI-NEXT: v_mov_b32_e32 v17, v1
-; VI-NEXT: v_mov_b32_e32 v8, v18
-; VI-NEXT: v_mov_b32_e32 v21, v19
-; VI-NEXT: ; implicit-def: $vgpr1
-; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17]
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
; VI-NEXT: .LBB96_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB96_4
; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_mov_b32_e32 v3, 3
-; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v14, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v10, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v17, 3, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; VI-NEXT: v_add_u16_e32 v16, 3, v0
+; VI-NEXT: v_add_u16_sdwa v6, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v20, 3, v19
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; VI-NEXT: v_add_u16_sdwa v2, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v14, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v10, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v20, v0
+; VI-NEXT: v_add_u16_e32 v19, 3, v18
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; VI-NEXT: v_add_u16_e32 v21, 3, v19
+; VI-NEXT: v_add_u16_e32 v21, 3, v17
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14
-; VI-NEXT: v_add_u16_e32 v8, 3, v18
+; VI-NEXT: v_add_u16_e32 v17, 3, v16
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10
-; VI-NEXT: v_or_b32_e32 v1, v17, v1
-; VI-NEXT: v_or_b32_e32 v0, v16, v0
-; VI-NEXT: v_or_b32_e32 v19, v21, v4
-; VI-NEXT: v_or_b32_e32 v18, v8, v3
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[18:19]
+; VI-NEXT: v_or_b32_e32 v0, v19, v0
+; VI-NEXT: v_or_b32_e32 v8, v21, v4
+; VI-NEXT: v_or_b32_e32 v7, v17, v3
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8]
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_bfe_u32 v15, v14, 8, 8
; VI-NEXT: v_bfe_u32 v7, v6, 8, 8
+; VI-NEXT: v_mov_b32_e32 v18, v19
+; VI-NEXT: v_mov_b32_e32 v19, v20
+; VI-NEXT: v_mov_b32_e32 v16, v17
+; VI-NEXT: v_mov_b32_e32 v17, v21
; VI-NEXT: .LBB96_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, v16
-; VI-NEXT: v_mov_b32_e32 v1, v20
-; VI-NEXT: v_mov_b32_e32 v4, v17
-; VI-NEXT: v_mov_b32_e32 v12, v21
+; VI-NEXT: v_mov_b32_e32 v0, v18
+; VI-NEXT: v_mov_b32_e32 v4, v19
+; VI-NEXT: v_mov_b32_e32 v8, v16
+; VI-NEXT: v_mov_b32_e32 v12, v17
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v8i16_to_v16i8:
@@ -24323,13 +24319,13 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
@@ -24413,6 +24409,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v17.l
@@ -25860,9 +25857,9 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_readfirstlane_b32 s9, v1
+; SI-NEXT: v_readfirstlane_b32 s8, v1
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s11, v0
+; SI-NEXT: v_readfirstlane_b32 s9, v0
; SI-NEXT: s_cbranch_scc0 .LBB111_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s16, 0xff
@@ -25876,11 +25873,11 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s4, s20, 0xff
; SI-NEXT: s_lshl_b32 s5, s21, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_lshl_b32 s8, s4, 16
+; SI-NEXT: s_lshl_b32 s10, s4, 16
; SI-NEXT: s_and_b32 s4, s22, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s23, 24
-; SI-NEXT: s_or_b32 s10, s5, s4
+; SI-NEXT: s_or_b32 s11, s5, s4
; SI-NEXT: s_and_b32 s4, s24, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s25, 24
@@ -25893,20 +25890,20 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
; SI-NEXT: s_lshl_b32 s5, s29, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_lshl_b32 s14, s4, 16
-; SI-NEXT: s_and_b32 s4, s11, 0xff
+; SI-NEXT: s_and_b32 s4, s9, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s9, 24
+; SI-NEXT: s_lshl_b32 s5, s8, 24
; SI-NEXT: s_or_b32 s15, s5, s4
; SI-NEXT: s_cbranch_execnz .LBB111_3
; SI-NEXT: .LBB111_2: ; %cmp.true
; SI-NEXT: s_add_i32 s28, s28, 3
; SI-NEXT: s_and_b32 s4, s28, 0xff
; SI-NEXT: s_lshl_b32 s5, s29, 8
-; SI-NEXT: s_add_i32 s11, s11, 3
+; SI-NEXT: s_add_i32 s9, s9, 3
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: s_and_b32 s6, s11, 0xff
+; SI-NEXT: s_and_b32 s6, s9, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s9, 24
+; SI-NEXT: s_lshl_b32 s5, s8, 24
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s6
@@ -25953,8 +25950,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
; SI-NEXT: s_add_i32 s6, s6, 0x3000000
; SI-NEXT: s_and_b32 s7, s6, 0xffff0000
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_and_b32 s10, s8, 0xffff0000
-; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_and_b32 s11, s8, 0xffff0000
+; SI-NEXT: s_lshl_b32 s10, s8, 16
; SI-NEXT: s_and_b32 s13, s5, 0xffff0000
; SI-NEXT: s_lshl_b32 s12, s5, 16
; SI-NEXT: s_and_b32 s15, s4, 0xffff0000
@@ -25962,8 +25959,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
; SI-NEXT: .LBB111_3: ; %end
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: v_mov_b32_e32 v3, s10
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_mov_b32_e32 v5, s13
; SI-NEXT: v_mov_b32_e32 v6, s14
@@ -25972,8 +25969,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
; SI-NEXT: .LBB111_4:
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; implicit-def: $sgpr7
-; SI-NEXT: ; implicit-def: $sgpr8
; SI-NEXT: ; implicit-def: $sgpr10
+; SI-NEXT: ; implicit-def: $sgpr11
; SI-NEXT: ; implicit-def: $sgpr12
; SI-NEXT: ; implicit-def: $sgpr13
; SI-NEXT: ; implicit-def: $sgpr14
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 8dc00701dcfd6..12c98939a2368 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -5622,7 +5622,7 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24
@@ -5642,14 +5642,14 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8
; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s43
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3
; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3
@@ -5681,16 +5681,16 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8
; GFX11-TRUE16-NEXT: .LBB25_3: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s45
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s43
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s44
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s42
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s41
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2
@@ -5719,10 +5719,10 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB25_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16
@@ -5749,7 +5749,7 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s44, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24
@@ -5770,13 +5770,13 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s44
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3
; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3
@@ -5809,12 +5809,12 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8
; GFX11-FAKE16-NEXT: .LBB25_3: ; %end
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v3, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40
@@ -5831,8 +5831,8 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB25_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
@@ -19875,7 +19875,7 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24
@@ -19895,14 +19895,14 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8
; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s43
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3
; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3
@@ -19934,16 +19934,16 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8
; GFX11-TRUE16-NEXT: .LBB69_3: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s45
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s43
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s44
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s42
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s41
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2
@@ -19972,10 +19972,10 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB69_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16
@@ -20002,7 +20002,7 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s44, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24
@@ -20023,13 +20023,13 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s44
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3
; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3
@@ -20062,12 +20062,12 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8
; GFX11-FAKE16-NEXT: .LBB69_3: ; %end
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v3, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40
@@ -20084,8 +20084,8 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB69_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
@@ -31362,36 +31362,32 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) {
; VI-LABEL: bitcast_v16i16_to_v32i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v33, v5
+; VI-NEXT: v_mov_b32_e32 v32, v4
+; VI-NEXT: v_mov_b32_e32 v35, v3
+; VI-NEXT: v_mov_b32_e32 v34, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0
-; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr5
; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr8
; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: ; implicit-def: $vgpr35
; VI-NEXT: ; implicit-def: $vgpr13
; VI-NEXT: ; implicit-def: $vgpr15
-; VI-NEXT: ; implicit-def: $vgpr16
; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr49
; VI-NEXT: ; implicit-def: $vgpr21
; VI-NEXT: ; implicit-def: $vgpr23
-; VI-NEXT: ; implicit-def: $vgpr24
; VI-NEXT: ; implicit-def: $vgpr25
-; VI-NEXT: ; implicit-def: $vgpr51
; VI-NEXT: ; implicit-def: $vgpr29
; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr3
; VI-NEXT: ; implicit-def: $vgpr11
; VI-NEXT: ; implicit-def: $vgpr19
; VI-NEXT: ; implicit-def: $vgpr27
@@ -31402,97 +31398,92 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7
; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7
; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5
-; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1
-; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0
+; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33
+; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34
+; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v1
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5]
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3]
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v50, v0
-; VI-NEXT: v_mov_b32_e32 v48, v1
-; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_mov_b32_e32 v35, v3
-; VI-NEXT: v_mov_b32_e32 v16, v4
-; VI-NEXT: v_mov_b32_e32 v49, v5
-; VI-NEXT: v_mov_b32_e32 v24, v6
-; VI-NEXT: v_mov_b32_e32 v51, v7
-; VI-NEXT: ; implicit-def: $vgpr1
-; VI-NEXT: ; implicit-def: $vgpr3
-; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: ; implicit-def: $vgpr7
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33]
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35]
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0
; VI-NEXT: .LBB96_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB96_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v9, 3
-; VI-NEXT: v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v22, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v18, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v30, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v26, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v48, 3, v1
+; VI-NEXT: v_mov_b32_e32 v13, 3
+; VI-NEXT: v_add_u16_sdwa v2, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v15, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; VI-NEXT: v_add_u16_sdwa v14, v35, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v15, v0
+; VI-NEXT: v_add_u16_e32 v16, 3, v35
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v14
+; VI-NEXT: v_add_u16_sdwa v10, v34, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v12, v16, v0
+; VI-NEXT: v_add_u16_e32 v17, 3, v34
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10
+; VI-NEXT: v_add_u16_sdwa v22, v33, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v11, v17, v0
+; VI-NEXT: v_add_u16_e32 v19, 3, v33
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22
+; VI-NEXT: v_add_u16_sdwa v18, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v36, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v9, v19, v0
+; VI-NEXT: v_add_u16_e32 v20, 3, v32
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18
+; VI-NEXT: v_add_u16_sdwa v30, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v5, 3, v1
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36
-; VI-NEXT: v_add_u16_e32 v50, 3, v0
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32
-; VI-NEXT: v_add_u16_e32 v35, 3, v3
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v14
-; VI-NEXT: v_add_u16_e32 v8, 3, v2
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; VI-NEXT: v_add_u16_e32 v49, 3, v5
-; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22
-; VI-NEXT: v_add_u16_e32 v16, 3, v4
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v18
-; VI-NEXT: v_add_u16_e32 v51, 3, v7
-; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v30
-; VI-NEXT: v_add_u16_e32 v24, 3, v6
-; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v26
-; VI-NEXT: v_or_b32_e32 v1, v48, v1
-; VI-NEXT: v_or_b32_e32 v0, v50, v0
-; VI-NEXT: v_or_b32_e32 v3, v35, v3
-; VI-NEXT: v_or_b32_e32 v2, v8, v2
-; VI-NEXT: v_or_b32_e32 v5, v49, v5
-; VI-NEXT: v_or_b32_e32 v4, v16, v4
-; VI-NEXT: v_or_b32_e32 v7, v51, v7
-; VI-NEXT: v_or_b32_e32 v6, v24, v6
-; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5]
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3]
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1]
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0
+; VI-NEXT: v_or_b32_e32 v8, v20, v0
+; VI-NEXT: v_add_u16_e32 v23, 3, v7
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30
+; VI-NEXT: v_add_u16_sdwa v26, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v5, v1
+; VI-NEXT: v_or_b32_e32 v1, v23, v0
+; VI-NEXT: v_add_u16_e32 v7, 3, v6
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26
+; VI-NEXT: v_or_b32_e32 v0, v7, v0
+; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[0:1]
+; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9
+; VI-NEXT: v_mov_b32_e32 v1, v5
+; VI-NEXT: v_mov_b32_e32 v32, v20
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v0
+; VI-NEXT: v_mov_b32_e32 v0, v15
+; VI-NEXT: v_mov_b32_e32 v34, v17
+; VI-NEXT: v_mov_b32_e32 v35, v16
+; VI-NEXT: v_mov_b32_e32 v6, v7
+; VI-NEXT: v_mov_b32_e32 v7, v23
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8
; VI-NEXT: v_bfe_u32 v31, v30, 8, 8
; VI-NEXT: v_bfe_u32 v23, v22, 8, 8
; VI-NEXT: v_bfe_u32 v15, v14, 8, 8
-; VI-NEXT: v_bfe_u32 v39, v36, 8, 8
+; VI-NEXT: v_bfe_u32 v37, v36, 8, 8
; VI-NEXT: .LBB96_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, v50
+; VI-NEXT: v_mov_b32_e32 v4, v1
+; VI-NEXT: v_mov_b32_e32 v8, v34
+; VI-NEXT: v_mov_b32_e32 v12, v35
+; VI-NEXT: v_mov_b32_e32 v16, v32
+; VI-NEXT: v_mov_b32_e32 v20, v33
+; VI-NEXT: v_mov_b32_e32 v24, v6
+; VI-NEXT: v_mov_b32_e32 v28, v7
; VI-NEXT: v_mov_b32_e32 v1, v38
-; VI-NEXT: v_mov_b32_e32 v2, v32
-; VI-NEXT: v_mov_b32_e32 v3, v33
-; VI-NEXT: v_mov_b32_e32 v4, v48
-; VI-NEXT: v_mov_b32_e32 v5, v37
; VI-NEXT: v_mov_b32_e32 v6, v36
-; VI-NEXT: v_mov_b32_e32 v7, v39
-; VI-NEXT: v_mov_b32_e32 v12, v35
-; VI-NEXT: v_mov_b32_e32 v20, v49
-; VI-NEXT: v_mov_b32_e32 v28, v51
+; VI-NEXT: v_mov_b32_e32 v7, v37
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v16i16_to_v32i8:
@@ -40362,19 +40353,19 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[2:3]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[2:3]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
@@ -40536,6 +40527,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v35.l
@@ -43074,10 +43066,10 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: v_readfirstlane_b32 s42, v15
; SI-NEXT: v_readfirstlane_b32 s43, v14
-; SI-NEXT: v_readfirstlane_b32 s40, v7
-; SI-NEXT: v_readfirstlane_b32 s41, v6
-; SI-NEXT: v_readfirstlane_b32 s10, v1
-; SI-NEXT: v_readfirstlane_b32 s9, v0
+; SI-NEXT: v_readfirstlane_b32 s13, v7
+; SI-NEXT: v_readfirstlane_b32 s15, v6
+; SI-NEXT: v_readfirstlane_b32 s7, v1
+; SI-NEXT: v_readfirstlane_b32 s6, v0
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9
@@ -43088,15 +43080,15 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: s_and_b32 s4, s16, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s17, 24
-; SI-NEXT: s_or_b32 s6, s5, s4
+; SI-NEXT: s_or_b32 s8, s5, s4
; SI-NEXT: s_and_b32 s4, s18, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s19, 24
-; SI-NEXT: s_or_b32 s7, s5, s4
+; SI-NEXT: s_or_b32 s9, s5, s4
; SI-NEXT: s_and_b32 s4, s20, 0xff
; SI-NEXT: s_lshl_b32 s5, s21, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_lshl_b32 s8, s4, 16
+; SI-NEXT: s_lshl_b32 s10, s4, 16
; SI-NEXT: s_and_b32 s4, s22, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s23, 24
@@ -43114,24 +43106,24 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s27, 24
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: s_or_b32 s13, s5, s4
+; SI-NEXT: s_or_b32 s14, s5, s4
; SI-NEXT: s_and_b32 s4, s28, 0xff
; SI-NEXT: s_lshl_b32 s5, s29, 8
; SI-NEXT: v_or_b32_e32 v9, v0, v7
; SI-NEXT: v_and_b32_e32 v7, 0xff, v8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: s_lshl_b32 s14, s4, 16
-; SI-NEXT: s_and_b32 s4, s9, 0xff
+; SI-NEXT: s_lshl_b32 s40, s4, 16
+; SI-NEXT: s_and_b32 s4, s6, 0xff
; SI-NEXT: v_or_b32_e32 v19, v1, v7
; SI-NEXT: v_and_b32_e32 v7, 0xff, v10
; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s5, s10, 24
+; SI-NEXT: s_lshl_b32 s5, s7, 24
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v11
-; SI-NEXT: s_or_b32 s15, s5, s4
-; SI-NEXT: s_and_b32 s4, s41, 0xff
-; SI-NEXT: s_lshl_b32 s5, s40, 8
+; SI-NEXT: s_or_b32 s41, s5, s4
+; SI-NEXT: s_and_b32 s4, s15, 0xff
+; SI-NEXT: s_lshl_b32 s5, s13, 8
; SI-NEXT: v_or_b32_e32 v18, v13, v7
; SI-NEXT: v_and_b32_e32 v7, 0xff, v12
; SI-NEXT: s_or_b32 s4, s4, s5
@@ -43167,11 +43159,11 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v5, v5, v9
-; SI-NEXT: s_add_i32 s41, s41, 3
+; SI-NEXT: s_add_i32 s15, s15, 3
; SI-NEXT: v_or_b32_e32 v6, s4, v6
; SI-NEXT: v_or_b32_e32 v5, v5, v7
-; SI-NEXT: s_and_b32 s4, s41, 0xff
-; SI-NEXT: s_lshl_b32 s5, s40, 8
+; SI-NEXT: s_and_b32 s4, s15, 0xff
+; SI-NEXT: s_lshl_b32 s5, s13, 8
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
@@ -43183,11 +43175,11 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: v_or_b32_e32 v1, s4, v1
; SI-NEXT: s_and_b32 s4, s28, 0xff
; SI-NEXT: s_lshl_b32 s5, s29, 8
-; SI-NEXT: s_add_i32 s9, s9, 3
+; SI-NEXT: s_add_i32 s6, s6, 3
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: s_and_b32 s6, s9, 0xff
+; SI-NEXT: s_and_b32 s6, s6, 0xff
; SI-NEXT: s_addk_i32 s4, 0x300
-; SI-NEXT: s_lshl_b32 s5, s10, 24
+; SI-NEXT: s_lshl_b32 s5, s7, 24
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s5, s6
@@ -43210,36 +43202,35 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: s_add_i32 s22, s22, 3
; SI-NEXT: s_or_b32 s6, s7, s6
; SI-NEXT: s_and_b32 s8, s22, 0xff
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: s_addk_i32 s6, 0x300
; SI-NEXT: s_lshl_b32 s7, s23, 24
; SI-NEXT: s_lshl_b32 s8, s8, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
-; SI-NEXT: s_and_b32 s6, s6, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s8
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_and_b32 s6, s6, 0xffff
+; SI-NEXT: s_or_b32 s7, s7, s8
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4
-; SI-NEXT: s_add_i32 s8, s6, 0x3000000
-; SI-NEXT: s_and_b32 s6, s16, 0xff
-; SI-NEXT: s_lshl_b32 s7, s17, 8
+; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_and_b32 s7, s16, 0xff
+; SI-NEXT: s_lshl_b32 s8, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s7, s8, s7
; SI-NEXT: s_and_b32 s9, s18, 0xff
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_addk_i32 s6, 0x300
-; SI-NEXT: s_lshl_b32 s7, s19, 24
+; SI-NEXT: s_addk_i32 s7, 0x300
+; SI-NEXT: s_lshl_b32 s8, s19, 24
; SI-NEXT: s_lshl_b32 s9, s9, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v3
-; SI-NEXT: s_and_b32 s6, s6, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: v_or_b32_e32 v0, v0, v2
-; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s7, s8, s7
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6
; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v5
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1
@@ -43247,14 +43238,15 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: s_add_i32 s4, s4, 0x3000000
; SI-NEXT: s_add_i32 s5, s5, 0x3000000
; SI-NEXT: s_add_i32 s6, s6, 0x3000000
-; SI-NEXT: s_and_b32 s7, s6, 0xffff0000
-; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_and_b32 s11, s8, 0xffff0000
-; SI-NEXT: s_lshl_b32 s8, s8, 16
-; SI-NEXT: s_and_b32 s13, s5, 0xffff0000
+; SI-NEXT: s_add_i32 s7, s7, 0x3000000
+; SI-NEXT: s_and_b32 s9, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s8, s7, 16
+; SI-NEXT: s_and_b32 s11, s6, 0xffff0000
+; SI-NEXT: s_lshl_b32 s10, s6, 16
+; SI-NEXT: s_and_b32 s14, s5, 0xffff0000
; SI-NEXT: s_lshl_b32 s12, s5, 16
-; SI-NEXT: s_and_b32 s15, s4, 0xffff0000
-; SI-NEXT: s_lshl_b32 s14, s4, 16
+; SI-NEXT: s_and_b32 s41, s4, 0xffff0000
+; SI-NEXT: s_lshl_b32 s40, s4, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1
@@ -43265,14 +43257,14 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6
; SI-NEXT: s_branch .LBB111_5
; SI-NEXT: .LBB111_3:
-; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr7
; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr9
+; SI-NEXT: ; implicit-def: $sgpr10
; SI-NEXT: ; implicit-def: $sgpr11
; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr13
; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $sgpr15
+; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr41
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $sgpr44
@@ -43286,14 +43278,14 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; SI-NEXT: v_mov_b32_e32 v10, s44
; SI-NEXT: v_mov_b32_e32 v14, s45
; SI-NEXT: .LBB111_5: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
-; SI-NEXT: v_mov_b32_e32 v2, s8
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: v_mov_b32_e32 v4, s12
-; SI-NEXT: v_mov_b32_e32 v5, s13
-; SI-NEXT: v_mov_b32_e32 v6, s14
-; SI-NEXT: v_mov_b32_e32 v7, s15
+; SI-NEXT: v_mov_b32_e32 v5, s14
+; SI-NEXT: v_mov_b32_e32 v6, s40
+; SI-NEXT: v_mov_b32_e32 v7, s41
; SI-NEXT: v_mov_b32_e32 v8, v17
; SI-NEXT: v_mov_b32_e32 v11, v19
; SI-NEXT: v_mov_b32_e32 v12, v18
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index e5a1260fa4538..5b82535bf956a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -4076,7 +4076,7 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s63, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s61, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24
@@ -4102,14 +4102,14 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8
; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s61
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3
@@ -4150,12 +4150,12 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8
; GFX11-TRUE16-NEXT: .LBB13_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s62
-; GFX11-TRUE16-NEXT: s_mov_b32 s7, s61
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s63
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s62
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s12
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
@@ -4291,8 +4291,8 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
@@ -4327,7 +4327,7 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s63, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s61, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24
@@ -4353,14 +4353,14 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s61
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3
@@ -4401,12 +4401,12 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8
; GFX11-FAKE16-NEXT: .LBB13_3: ; %end
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s62, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s61, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s63, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s62, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s12, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5
; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9
@@ -4506,8 +4506,8 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB13_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59
@@ -16547,10 +16547,9 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21
; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4
; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6
@@ -16563,47 +16562,48 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v16
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20
-; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; kill: killed $vgpr51
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB48_2
@@ -16628,10 +16628,10 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v6, v1, v25
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19
; SI-NEXT: v_or_b32_e32 v2, v1, v24
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_bfe_u32 v1, v4, 8, 8
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20
; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24
; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8
@@ -16647,20 +16647,20 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24
; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16
; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31
-; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22
-; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18
-; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10
+; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v31
+; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v22
+; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v18
; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2
-; SI-NEXT: v_and_b32_e32 v45, 0xffff, v8
-; SI-NEXT: v_and_b32_e32 v42, 0xffff, v12
-; SI-NEXT: v_and_b32_e32 v55, 0xffff, v16
-; SI-NEXT: v_and_b32_e32 v51, 0xffff, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: v_bfe_u32 v46, v8, 8, 8
-; SI-NEXT: v_bfe_u32 v43, v12, 8, 8
-; SI-NEXT: v_bfe_u32 v40, v16, 8, 8
-; SI-NEXT: v_bfe_u32 v53, v20, 8, 8
+; SI-NEXT: v_and_b32_e32 v46, 0xffff, v4
+; SI-NEXT: v_and_b32_e32 v43, 0xffff, v8
+; SI-NEXT: v_and_b32_e32 v40, 0xffff, v12
+; SI-NEXT: v_and_b32_e32 v53, 0xffff, v16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v47, v4, 8, 8
+; SI-NEXT: v_bfe_u32 v44, v8, 8, 8
+; SI-NEXT: v_bfe_u32 v41, v12, 8, 8
+; SI-NEXT: v_bfe_u32 v54, v16, 8, 8
+; SI-NEXT: v_bfe_u32 v51, v20, 8, 8
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr5
@@ -16682,10 +16682,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: .LBB48_2: ; %Flow
-; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
+; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB48_4
; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13
@@ -16730,6 +16727,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4
; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10
; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24
; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16
; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8
@@ -16745,22 +16743,23 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24
; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16
; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v31
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31
-; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v22
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22
-; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v18
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18
-; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v10
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10
-; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v2
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v31
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v31
+; SI-NEXT: v_lshrrev_b32_e32 v44, 24, v22
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v22
+; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v18
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v18
+; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v10
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: .LBB48_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
@@ -16776,13 +16775,11 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v31
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v45
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v8
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v46
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v47
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
@@ -16802,11 +16799,11 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v42
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v45
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v43
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v46
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v44
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
@@ -16826,11 +16823,11 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v41
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v55
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v40
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v43
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v41
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
@@ -16848,13 +16845,15 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v54
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v54
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v55
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v53
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
@@ -16876,13 +16875,15 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v51
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v53
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -16917,40 +16918,27 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3
; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr16
; VI-NEXT: ; implicit-def: $vgpr15
-; VI-NEXT: ; implicit-def: $vgpr32
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr14
-; VI-NEXT: ; implicit-def: $vgpr37
; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr14
; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: ; implicit-def: $vgpr13
-; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; implicit-def: $vgpr12
-; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: ; implicit-def: $vgpr30
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr32
; VI-NEXT: ; implicit-def: $vgpr28
-; VI-NEXT: ; implicit-def: $vgpr11
-; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr27
; VI-NEXT: ; implicit-def: $vgpr29
+; VI-NEXT: ; implicit-def: $vgpr11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB48_2
@@ -16959,37 +16947,22 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8]
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6]
; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v10
; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v8
; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v2
-; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v1
-; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v34, v1
-; VI-NEXT: v_mov_b32_e32 v32, v2
-; VI-NEXT: v_mov_b32_e32 v38, v3
-; VI-NEXT: v_mov_b32_e32 v37, v4
-; VI-NEXT: v_mov_b32_e32 v50, v5
-; VI-NEXT: v_mov_b32_e32 v49, v6
-; VI-NEXT: v_mov_b32_e32 v55, v7
-; VI-NEXT: v_mov_b32_e32 v53, v8
-; VI-NEXT: v_mov_b32_e32 v43, v9
-; VI-NEXT: v_mov_b32_e32 v42, v10
-; VI-NEXT: ; implicit-def: $vgpr1
-; VI-NEXT: ; implicit-def: $vgpr3
-; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: ; implicit-def: $vgpr7
-; VI-NEXT: ; implicit-def: $vgpr9
+; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; VI-NEXT: .LBB48_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB48_4
@@ -16999,137 +16972,144 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; VI-NEXT: v_add_u16_sdwa v20, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v18, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v22, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v42, 3, v10
+; VI-NEXT: v_add_u16_e32 v55, 3, v10
; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v17
-; VI-NEXT: v_add_u16_e32 v43, 3, v9
+; VI-NEXT: v_add_u16_e32 v40, 3, v9
; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20
; VI-NEXT: v_add_u16_sdwa v19, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v24, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_e32 v53, 3, v8
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v18
-; VI-NEXT: v_add_u16_e32 v55, 3, v7
+; VI-NEXT: v_add_u16_e32 v54, 3, v7
; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v22
-; VI-NEXT: v_or_b32_e32 v10, v42, v10
-; VI-NEXT: v_or_b32_e32 v9, v43, v9
+; VI-NEXT: v_or_b32_e32 v10, v55, v10
+; VI-NEXT: v_or_b32_e32 v9, v40, v9
; VI-NEXT: v_add_u16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v26, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v21, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v25, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v49, 3, v6
+; VI-NEXT: v_add_u16_e32 v51, 3, v6
; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v19
-; VI-NEXT: v_add_u16_e32 v50, 3, v5
+; VI-NEXT: v_add_u16_e32 v52, 3, v5
; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v24
; VI-NEXT: v_or_b32_e32 v8, v53, v8
-; VI-NEXT: v_or_b32_e32 v7, v55, v7
+; VI-NEXT: v_or_b32_e32 v7, v54, v7
; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
-; VI-NEXT: v_add_u16_e32 v37, 3, v4
+; VI-NEXT: v_add_u16_e32 v49, 3, v4
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v21
-; VI-NEXT: v_add_u16_e32 v38, 3, v3
+; VI-NEXT: v_add_u16_e32 v50, 3, v3
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v25
-; VI-NEXT: v_or_b32_e32 v6, v49, v6
-; VI-NEXT: v_or_b32_e32 v5, v50, v5
+; VI-NEXT: v_or_b32_e32 v6, v51, v6
+; VI-NEXT: v_or_b32_e32 v5, v52, v5
; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8]
-; VI-NEXT: v_add_u16_e32 v32, 3, v2
+; VI-NEXT: v_add_u16_e32 v39, 3, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23
-; VI-NEXT: v_add_u16_e32 v34, 3, v1
+; VI-NEXT: v_add_u16_e32 v48, 3, v1
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26
-; VI-NEXT: v_or_b32_e32 v4, v37, v4
-; VI-NEXT: v_or_b32_e32 v3, v38, v3
+; VI-NEXT: v_or_b32_e32 v4, v49, v4
+; VI-NEXT: v_or_b32_e32 v3, v50, v3
; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6]
-; VI-NEXT: v_or_b32_e32 v2, v32, v2
-; VI-NEXT: v_or_b32_e32 v1, v34, v1
+; VI-NEXT: v_or_b32_e32 v2, v39, v2
+; VI-NEXT: v_or_b32_e32 v1, v48, v1
; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4]
; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10
; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9
; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8
; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; VI-NEXT: v_bfe_u32 v29, v17, 8, 8
-; VI-NEXT: v_bfe_u32 v33, v18, 8, 8
-; VI-NEXT: v_bfe_u32 v39, v19, 8, 8
-; VI-NEXT: v_bfe_u32 v52, v21, 8, 8
-; VI-NEXT: v_bfe_u32 v41, v23, 8, 8
+; VI-NEXT: v_bfe_u32 v32, v18, 8, 8
+; VI-NEXT: v_bfe_u32 v35, v19, 8, 8
+; VI-NEXT: v_mov_b32_e32 v1, v48
+; VI-NEXT: v_mov_b32_e32 v2, v39
+; VI-NEXT: v_mov_b32_e32 v3, v50
+; VI-NEXT: v_mov_b32_e32 v4, v49
+; VI-NEXT: v_mov_b32_e32 v5, v52
+; VI-NEXT: v_mov_b32_e32 v6, v51
+; VI-NEXT: v_mov_b32_e32 v7, v54
+; VI-NEXT: v_mov_b32_e32 v8, v53
+; VI-NEXT: v_mov_b32_e32 v9, v40
+; VI-NEXT: v_mov_b32_e32 v10, v55
+; VI-NEXT: v_bfe_u32 v39, v21, 8, 8
+; VI-NEXT: v_bfe_u32 v48, v23, 8, 8
; VI-NEXT: .LBB48_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v15
-; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15
+; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16
+; VI-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v48
; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14
-; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52
-; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13
-; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39
-; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12
-; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v33
-; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11
-; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29
-; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -20353,8 +20333,8 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: v_readfirstlane_b32 s14, v19
-; SI-NEXT: v_readfirstlane_b32 s40, v18
-; SI-NEXT: v_readfirstlane_b32 s12, v11
+; SI-NEXT: v_readfirstlane_b32 s15, v18
+; SI-NEXT: v_readfirstlane_b32 s11, v11
; SI-NEXT: v_readfirstlane_b32 s13, v10
; SI-NEXT: v_readfirstlane_b32 s8, v3
; SI-NEXT: v_readfirstlane_b32 s9, v2
@@ -20380,22 +20360,22 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: s_lshl_b32 s10, s23, 24
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s10, s5
-; SI-NEXT: s_or_b32 s11, s4, s5
+; SI-NEXT: s_or_b32 s12, s4, s5
; SI-NEXT: s_and_b32 s4, s18, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s10, s19, 24
; SI-NEXT: s_or_b32 s4, s10, s4
; SI-NEXT: s_and_b32 s10, s28, 0xff
-; SI-NEXT: s_lshl_b32 s15, s29, 8
-; SI-NEXT: s_or_b32 s10, s10, s15
-; SI-NEXT: s_and_b32 s15, s6, 0xff
-; SI-NEXT: s_lshl_b32 s15, s15, 16
+; SI-NEXT: s_lshl_b32 s40, s29, 8
+; SI-NEXT: s_or_b32 s10, s10, s40
+; SI-NEXT: s_and_b32 s40, s6, 0xff
+; SI-NEXT: s_lshl_b32 s40, s40, 16
; SI-NEXT: s_lshl_b32 s41, s7, 24
-; SI-NEXT: s_or_b32 s43, s41, s15
-; SI-NEXT: s_and_b32 s15, s26, 0xff
-; SI-NEXT: s_lshl_b32 s15, s15, 16
+; SI-NEXT: s_or_b32 s43, s41, s40
+; SI-NEXT: s_and_b32 s40, s26, 0xff
+; SI-NEXT: s_lshl_b32 s40, s40, 16
; SI-NEXT: s_lshl_b32 s41, s27, 24
-; SI-NEXT: s_or_b32 s15, s41, s15
+; SI-NEXT: s_or_b32 s40, s41, s40
; SI-NEXT: s_and_b32 s41, s16, 0xff
; SI-NEXT: s_lshl_b32 s42, s17, 8
; SI-NEXT: s_or_b32 s41, s41, s42
@@ -20412,12 +20392,12 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: v_or_b32_e32 v11, v0, v10
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_mov_b32_e32 v5, s15
+; SI-NEXT: v_mov_b32_e32 v5, s40
; SI-NEXT: v_or_b32_e32 v10, v9, v11
; SI-NEXT: v_and_b32_e32 v9, 0xff, v4
; SI-NEXT: v_and_b32_e32 v13, 0xff, v14
; SI-NEXT: v_and_b32_e32 v17, 0xff, v16
-; SI-NEXT: s_or_b32 s15, s4, s15
+; SI-NEXT: s_or_b32 s40, s4, s40
; SI-NEXT: s_and_b32 s4, s9, 0xff
; SI-NEXT: s_lshl_b32 s42, s8, 8
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -20434,7 +20414,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_and_b32_e32 v18, 0xff, v24
; SI-NEXT: v_or_b32_e32 v23, s4, v15
; SI-NEXT: s_and_b32 s4, s13, 0xff
-; SI-NEXT: s_lshl_b32 s42, s12, 8
+; SI-NEXT: s_lshl_b32 s42, s11, 8
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_or_b32_e32 v17, v17, v30
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
@@ -20446,7 +20426,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_or_b32_e32 v18, v17, v32
; SI-NEXT: v_and_b32_e32 v17, 0xff, v20
; SI-NEXT: v_or_b32_e32 v26, s4, v21
-; SI-NEXT: s_and_b32 s4, s40, 0xff
+; SI-NEXT: s_and_b32 s4, s15, 0xff
; SI-NEXT: s_lshl_b32 s42, s14, 8
; SI-NEXT: s_and_b32 s10, s10, 0xffff
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
@@ -20454,7 +20434,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: s_or_b32 s10, s10, s43
; SI-NEXT: v_or_b32_e32 v33, v31, v17
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16
+; SI-NEXT: v_alignbit_b32 v1, s12, v1, 16
; SI-NEXT: v_alignbit_b32 v5, s10, v5, 16
; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16
; SI-NEXT: v_alignbit_b32 v13, v25, v21, 16
@@ -20467,8 +20447,8 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32
; SI-NEXT: s_cbranch_execnz .LBB51_3
; SI-NEXT: .LBB51_2: ; %cmp.true
-; SI-NEXT: s_add_i32 s40, s40, 3
-; SI-NEXT: s_and_b32 s4, s40, 0xff
+; SI-NEXT: s_add_i32 s15, s15, 3
+; SI-NEXT: s_and_b32 s4, s15, 0xff
; SI-NEXT: s_lshl_b32 s5, s14, 8
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -20492,7 +20472,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: s_add_i32 s13, s13, 3
; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1
; SI-NEXT: s_and_b32 s4, s13, 0xff
-; SI-NEXT: s_lshl_b32 s5, s12, 8
+; SI-NEXT: s_lshl_b32 s5, s11, 8
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
@@ -20537,7 +20517,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s28, s28, 3
-; SI-NEXT: s_add_i32 s15, s4, 0x3000000
+; SI-NEXT: s_add_i32 s40, s4, 0x3000000
; SI-NEXT: s_and_b32 s4, s28, 0xff
; SI-NEXT: s_lshl_b32 s5, s29, 8
; SI-NEXT: s_add_i32 s6, s6, 3
@@ -20587,24 +20567,24 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0
-; SI-NEXT: s_add_i32 s11, s4, 0x3000000
+; SI-NEXT: s_add_i32 s12, s4, 0x3000000
; SI-NEXT: v_mov_b32_e32 v0, s41
-; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16
-; SI-NEXT: v_mov_b32_e32 v0, s15
+; SI-NEXT: v_alignbit_b32 v1, s12, v0, 16
+; SI-NEXT: v_mov_b32_e32 v0, s40
; SI-NEXT: v_alignbit_b32 v5, s10, v0, 16
; SI-NEXT: v_alignbit_b32 v9, v10, v23, 16
; SI-NEXT: v_alignbit_b32 v13, v25, v26, 16
; SI-NEXT: v_alignbit_b32 v17, v18, v21, 16
-; SI-NEXT: s_lshr_b32 s42, s11, 16
+; SI-NEXT: s_lshr_b32 s42, s12, 16
; SI-NEXT: s_lshr_b32 s43, s10, 16
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v25
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18
; SI-NEXT: .LBB51_3: ; %end
; SI-NEXT: v_mov_b32_e32 v0, s41
-; SI-NEXT: v_mov_b32_e32 v2, s11
+; SI-NEXT: v_mov_b32_e32 v2, s12
; SI-NEXT: v_mov_b32_e32 v3, s42
-; SI-NEXT: v_mov_b32_e32 v4, s15
+; SI-NEXT: v_mov_b32_e32 v4, s40
; SI-NEXT: v_mov_b32_e32 v6, s10
; SI-NEXT: v_mov_b32_e32 v7, s43
; SI-NEXT: v_mov_b32_e32 v8, v23
@@ -20615,9 +20595,9 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; SI-NEXT: .LBB51_4:
; SI-NEXT: ; implicit-def: $sgpr41
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $sgpr11
+; SI-NEXT: ; implicit-def: $sgpr12
; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $sgpr15
+; SI-NEXT: ; implicit-def: $sgpr40
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $sgpr10
; SI-NEXT: ; implicit-def: $sgpr43
@@ -38803,7 +38783,7 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s63, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s61, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24
@@ -38829,14 +38809,14 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8
; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s61
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB79_3
; GFX11-TRUE16-NEXT: .LBB79_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3
@@ -38877,12 +38857,12 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8
; GFX11-TRUE16-NEXT: .LBB79_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s62
-; GFX11-TRUE16-NEXT: s_mov_b32 s7, s61
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s63
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s62
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s12
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
@@ -39018,8 +38998,8 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB79_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
@@ -39054,7 +39034,7 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s63, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s61, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24
@@ -39080,14 +39060,14 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s61
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB79_3
; GFX11-FAKE16-NEXT: .LBB79_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3
@@ -39128,12 +39108,12 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8
; GFX11-FAKE16-NEXT: .LBB79_3: ; %end
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s62, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s61, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s63, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s62, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s12, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5
; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9
@@ -39233,8 +39213,8 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB79_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 7b756bce857bc..fbdee6eea327e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -6112,10 +6112,8 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) {
; VI-LABEL: bitcast_v2i16_to_v4i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: ; implicit-def: $vgpr3
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -6128,20 +6126,19 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) {
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB56_3: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
-; VI-NEXT: v_mov_b32_e32 v0, v4
-; VI-NEXT: ; implicit-def: $vgpr4
+; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB56_2
; VI-NEXT: .LBB56_4: ; %cmp.true
; VI-NEXT: v_mov_b32_e32 v1, 3
-; VI-NEXT: v_add_u16_sdwa v2, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v0, 3, v4
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; VI-NEXT: v_or_b32_e32 v1, v0, v1
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v4, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; VI-NEXT: v_or_b32_e32 v0, v4, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_bfe_u32 v3, v2, 8, 8
+; VI-NEXT: v_mov_b32_e32 v0, v4
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -6292,31 +6289,30 @@ define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inre
; VI-NEXT: s_cmp_lg_u32 s17, 0
; VI-NEXT: s_cbranch_scc0 .LBB57_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s8, s16, 24
+; VI-NEXT: s_lshr_b32 s7, s16, 24
; VI-NEXT: s_lshr_b32 s6, s16, 16
-; VI-NEXT: s_lshr_b32 s9, s16, 8
-; VI-NEXT: s_mov_b32 s7, s16
+; VI-NEXT: s_lshr_b32 s8, s16, 8
; VI-NEXT: s_cbranch_execnz .LBB57_3
; VI-NEXT: .LBB57_2: ; %cmp.true
-; VI-NEXT: s_lshr_b32 s5, s16, 16
-; VI-NEXT: s_add_i32 s7, s16, 3
-; VI-NEXT: s_add_i32 s6, s5, 3
-; VI-NEXT: s_and_b32 s4, s7, 0xffff
-; VI-NEXT: s_lshl_b32 s5, s6, 16
-; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_lshr_b32 s9, s4, 8
-; VI-NEXT: s_bfe_u32 s8, s6, 0x80008
+; VI-NEXT: s_lshr_b32 s6, s16, 16
+; VI-NEXT: s_add_i32 s4, s16, 3
+; VI-NEXT: s_add_i32 s6, s6, 3
+; VI-NEXT: s_and_b32 s5, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s6, 16
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_lshr_b32 s8, s5, 8
+; VI-NEXT: s_bfe_u32 s7, s6, 0x80008
; VI-NEXT: .LBB57_3: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v1, s8
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s8
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB57_4:
-; VI-NEXT: ; implicit-def: $sgpr7
-; VI-NEXT: ; implicit-def: $sgpr9
-; VI-NEXT: ; implicit-def: $sgpr6
; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr7
; VI-NEXT: s_branch .LBB57_2
;
; GFX9-LABEL: bitcast_v2i16_to_v4i8_scalar:
@@ -9174,8 +9170,8 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
@@ -9204,6 +9200,7 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-TRUE16-NEXT: .LBB76_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 1024c2a7f066a..68498462f489e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -64649,11 +64649,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
@@ -64728,9 +64727,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10
@@ -64746,8 +64746,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v30
; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; kill: killed $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr50
@@ -64758,11 +64758,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; kill: killed $vgpr2
@@ -64771,8 +64770,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; kill: killed $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -64785,207 +64784,204 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v56, v1, v63
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v47, v1, v27
+; SI-NEXT: v_or_b32_e32 v47, v1, v62
; SI-NEXT: v_alignbit_b32 v1, v47, v56, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5
; SI-NEXT: v_or_b32_e32 v50, v1, v37
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v38, v1, v36
; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9
; SI-NEXT: v_or_b32_e32 v34, v1, v48
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11
; SI-NEXT: v_or_b32_e32 v33, v1, v39
; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13
; SI-NEXT: v_or_b32_e32 v32, v1, v51
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
; SI-NEXT: v_or_b32_e32 v31, v1, v49
; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17
; SI-NEXT: v_or_b32_e32 v30, v1, v53
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19
; SI-NEXT: v_or_b32_e32 v26, v1, v52
; SI-NEXT: v_alignbit_b32 v1, v26, v30, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v26, v30, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v26, v30, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21
; SI-NEXT: v_or_b32_e32 v22, v1, v55
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23
; SI-NEXT: v_or_b32_e32 v18, v1, v54
+; SI-NEXT: v_alignbit_b32 v1, v18, v22, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v18, v22, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25
; SI-NEXT: v_or_b32_e32 v14, v1, v41
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24
-; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16
-; SI-NEXT: v_bfe_u32 v62, v44, 8, 8
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
; SI-NEXT: v_or_b32_e32 v10, v1, v40
; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29
; SI-NEXT: v_or_b32_e32 v6, v1, v43
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35
; SI-NEXT: v_or_b32_e32 v2, v1, v42
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v4, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v8, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v12, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v16, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v20, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v24, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_bfe_u32 v1, v28, 8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24
; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24
-; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16
+; SI-NEXT: v_alignbit_b32 v61, v10, v14, 16
; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24
; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16
; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_u32 v58, v44, 8, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: .LBB96_2: ; %Flow
; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v20, v61
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB96_4
; SI-NEXT: ; %bb.3: ; %cmp.true
@@ -64995,35 +64991,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: s_mov_b32 s6, 0x30000
; SI-NEXT: v_or_b32_e32 v4, v41, v4
; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v63, v1
-; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v1
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v27, v1
-; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v1
-; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v43, v2
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v42, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
-; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24
-; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16
-; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8
-; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: v_or_b32_e32 v4, v40, v4
; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4
@@ -65057,121 +65025,157 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; SI-NEXT: v_or_b32_e32 v4, v39, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5
+; SI-NEXT: v_or_b32_e32 v1, v63, v1
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v1
+; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3
; SI-NEXT: v_or_b32_e32 v4, v37, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7
+; SI-NEXT: v_or_b32_e32 v1, v62, v1
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v1
; SI-NEXT: v_or_b32_e32 v4, v36, v4
+; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16
; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8
+; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v26, v30, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v26, v30, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v18, v22, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v47
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v31
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v26
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v2, v43, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v10
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v2, v42, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; SI-NEXT: v_alignbit_b32 v4, v47, v56, 24
-; SI-NEXT: v_alignbit_b32 v24, v31, v32, 24
-; SI-NEXT: v_alignbit_b32 v28, v31, v32, 16
-; SI-NEXT: v_alignbit_b32 v12, v26, v30, 24
-; SI-NEXT: v_alignbit_b32 v16, v26, v30, 16
-; SI-NEXT: v_alignbit_b32 v44, v26, v30, 8
+; SI-NEXT: v_alignbit_b32 v44, v26, v30, 24
; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24
; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16
-; SI-NEXT: v_alignbit_b32 v20, v18, v22, 8
; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24
-; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16
-; SI-NEXT: v_alignbit_b32 v8, v10, v14, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v10, v14, 16
+; SI-NEXT: v_alignbit_b32 v28, v10, v14, 8
+; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24
+; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16
+; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8
+; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: .LBB96_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v56
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -65179,7 +65183,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
@@ -65187,14 +65191,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v47
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -65205,14 +65209,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v50
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
@@ -65223,14 +65227,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v38
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -65241,14 +65245,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v34
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
@@ -65259,14 +65263,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v33
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -65277,28 +65281,32 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v32
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v28
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v31
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -65309,26 +65317,30 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v30
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v16
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v26
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
@@ -65339,26 +65351,28 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_and_b32_e32 v3, 0xff, v61
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v18
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
@@ -65371,9 +65385,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v8
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v28
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v58
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v20
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v57
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -65381,14 +65395,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v10
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
@@ -65413,13 +65427,13 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -65448,28 +65462,27 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
;
; VI-LABEL: bitcast_v32i16_to_v64i8:
; VI: ; %bb.0:
-; VI-NEXT: ; implicit-def: $vgpr19
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1
; VI-NEXT: ; kill: killed $vgpr17
; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr19
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -65486,415 +65499,312 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; kill: killed $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr18
+; VI-NEXT: ; implicit-def: $vgpr26
+; VI-NEXT: ; implicit-def: $vgpr23
+; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr22
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr63
; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr51
; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr61
; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; kill: killed $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; kill: killed $vgpr19
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr28
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr27
; VI-NEXT: ; implicit-def: $vgpr25
-; VI-NEXT: ; implicit-def: $vgpr24
-; VI-NEXT: ; implicit-def: $vgpr23
-; VI-NEXT: ; implicit-def: $vgpr22
; VI-NEXT: ; implicit-def: $vgpr21
; VI-NEXT: ; implicit-def: $vgpr20
; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr18
+; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB96_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
-; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16
-; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v2
-; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1
-; VI-NEXT: v_mov_b32_e32 v25, v50
-; VI-NEXT: v_mov_b32_e32 v41, v1
-; VI-NEXT: v_mov_b32_e32 v54, v2
-; VI-NEXT: v_mov_b32_e32 v57, v3
-; VI-NEXT: v_mov_b32_e32 v47, v4
-; VI-NEXT: v_mov_b32_e32 v61, v5
-; VI-NEXT: v_mov_b32_e32 v60, v6
-; VI-NEXT: v_mov_b32_e32 v52, v7
-; VI-NEXT: v_mov_b32_e32 v63, v8
-; VI-NEXT: v_mov_b32_e32 v40, v9
-; VI-NEXT: v_mov_b32_e32 v53, v10
-; VI-NEXT: v_mov_b32_e32 v17, v11
-; VI-NEXT: v_mov_b32_e32 v44, v12
-; VI-NEXT: v_mov_b32_e32 v58, v13
-; VI-NEXT: v_mov_b32_e32 v56, v14
-; VI-NEXT: v_mov_b32_e32 v50, v15
-; VI-NEXT: v_mov_b32_e32 v62, v16
-; VI-NEXT: ; implicit-def: $vgpr1
-; VI-NEXT: ; implicit-def: $vgpr3
-; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: ; implicit-def: $vgpr7
-; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: ; implicit-def: $vgpr11
-; VI-NEXT: ; implicit-def: $vgpr13
-; VI-NEXT: ; implicit-def: $vgpr15
+; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v16
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16
+; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v63, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v2
+; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v1
; VI-NEXT: .LBB96_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB96_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v18, 3
-; VI-NEXT: v_add_u16_sdwa v26, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v29, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v62, 3, v16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v26
-; VI-NEXT: v_add_u16_e32 v50, 3, v15
-; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29
-; VI-NEXT: v_or_b32_e32 v16, v62, v16
-; VI-NEXT: v_or_b32_e32 v15, v50, v15
-; VI-NEXT: v_add_u16_sdwa v38, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v49, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v36, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v48, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v34, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v39, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v32, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v37, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v30, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v35, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v28, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v33, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v27, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v31, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[15:16]
-; VI-NEXT: v_add_u16_e32 v56, 3, v14
-; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27
-; VI-NEXT: v_add_u16_e32 v58, 3, v13
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31
-; VI-NEXT: v_or_b32_e32 v14, v56, v14
-; VI-NEXT: v_or_b32_e32 v13, v58, v13
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v17, 3
+; VI-NEXT: v_add_u16_sdwa v50, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v53, 3, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v50
+; VI-NEXT: v_add_u16_sdwa v52, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v28, v53, v2
+; VI-NEXT: v_add_u16_e32 v2, 3, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52
+; VI-NEXT: v_add_u16_sdwa v48, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v27, v2, v1
+; VI-NEXT: v_add_u16_e32 v54, 3, v4
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v48
+; VI-NEXT: v_add_u16_sdwa v51, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v24, v54, v1
+; VI-NEXT: v_add_u16_e32 v4, 3, v3
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v51
+; VI-NEXT: v_add_u16_sdwa v37, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v23, v4, v1
+; VI-NEXT: v_add_u16_e32 v56, 3, v6
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37
+; VI-NEXT: v_add_u16_sdwa v49, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v26, v56, v1
+; VI-NEXT: v_add_u16_e32 v6, 3, v5
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49
+; VI-NEXT: v_add_u16_sdwa v35, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v25, v6, v1
+; VI-NEXT: v_add_u16_e32 v57, 3, v8
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v35
+; VI-NEXT: v_add_u16_sdwa v39, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v47, v57, v1
+; VI-NEXT: v_add_u16_e32 v8, 3, v7
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39
+; VI-NEXT: v_add_u16_sdwa v33, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v46, v8, v1
+; VI-NEXT: v_add_u16_e32 v58, 3, v10
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33
+; VI-NEXT: v_add_u16_sdwa v38, v9, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v10, v58, v1
+; VI-NEXT: v_add_u16_e32 v59, 3, v9
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v38
+; VI-NEXT: v_add_u16_sdwa v31, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v9, v59, v1
+; VI-NEXT: v_add_u16_e32 v60, 3, v12
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31
+; VI-NEXT: v_add_u16_sdwa v36, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v12, v60, v1
+; VI-NEXT: v_add_u16_e32 v61, 3, v11
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36
+; VI-NEXT: v_add_u16_sdwa v30, v14, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v11, v61, v1
+; VI-NEXT: v_add_u16_e32 v62, 3, v14
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30
+; VI-NEXT: v_add_u16_sdwa v34, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v14, v62, v1
+; VI-NEXT: v_add_u16_e32 v63, 3, v13
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v34
+; VI-NEXT: v_add_u16_sdwa v29, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v13, v63, v1
+; VI-NEXT: v_add_u16_e32 v55, 3, v16
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29
+; VI-NEXT: v_add_u16_sdwa v32, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v16, v55, v1
+; VI-NEXT: v_add_u16_e32 v40, 3, v15
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32
+; VI-NEXT: v_or_b32_e32 v15, v40, v1
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
-; VI-NEXT: v_add_u16_e32 v44, 3, v12
-; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28
-; VI-NEXT: v_add_u16_e32 v17, 3, v11
-; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v33
-; VI-NEXT: v_add_u16_e32 v53, 3, v10
-; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v30
-; VI-NEXT: v_add_u16_e32 v40, 3, v9
-; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v35
-; VI-NEXT: v_or_b32_e32 v12, v44, v12
-; VI-NEXT: v_or_b32_e32 v11, v17, v11
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_add_u16_e32 v63, 3, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v32
-; VI-NEXT: v_add_u16_e32 v52, 3, v7
-; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v37
-; VI-NEXT: v_or_b32_e32 v10, v53, v10
-; VI-NEXT: v_or_b32_e32 v9, v40, v9
; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; VI-NEXT: v_add_u16_e32 v60, 3, v6
-; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v34
-; VI-NEXT: v_add_u16_e32 v61, 3, v5
-; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v39
-; VI-NEXT: v_or_b32_e32 v8, v63, v8
-; VI-NEXT: v_or_b32_e32 v7, v52, v7
; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT: v_add_u16_e32 v47, 3, v4
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v36
-; VI-NEXT: v_add_u16_e32 v57, 3, v3
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
-; VI-NEXT: v_or_b32_e32 v6, v60, v6
-; VI-NEXT: v_or_b32_e32 v5, v61, v5
-; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT: v_add_u16_e32 v54, 3, v2
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v38
-; VI-NEXT: v_add_u16_e32 v41, 3, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49
-; VI-NEXT: v_or_b32_e32 v4, v47, v4
-; VI-NEXT: v_or_b32_e32 v3, v57, v3
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
-; VI-NEXT: v_or_b32_e32 v2, v54, v2
-; VI-NEXT: v_or_b32_e32 v1, v41, v1
-; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
-; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1
-; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; VI-NEXT: v_bfe_u32 v1, v27, 8, 8
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v28, 8, 8
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v30, 8, 8
-; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2
-; VI-NEXT: v_bfe_u32 v25, v26, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v32, 8, 8
-; VI-NEXT: v_bfe_u32 v43, v34, 8, 8
-; VI-NEXT: v_bfe_u32 v46, v36, 8, 8
-; VI-NEXT: v_bfe_u32 v59, v38, 8, 8
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[46:47]
+; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v16
+; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14
+; VI-NEXT: v_mov_b32_e32 v14, v62
+; VI-NEXT: v_mov_b32_e32 v15, v40
+; VI-NEXT: v_mov_b32_e32 v40, v16
+; VI-NEXT: v_mov_b32_e32 v16, v55
+; VI-NEXT: v_mov_b32_e32 v55, v22
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v23
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[23:24]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v47
+; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v26
+; VI-NEXT: v_mov_b32_e32 v9, v59
+; VI-NEXT: v_mov_b32_e32 v11, v61
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[25:26]
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v24
+; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v46
+; VI-NEXT: v_mov_b32_e32 v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, v53
+; VI-NEXT: v_mov_b32_e32 v3, v4
+; VI-NEXT: v_mov_b32_e32 v4, v54
+; VI-NEXT: v_mov_b32_e32 v5, v6
+; VI-NEXT: v_mov_b32_e32 v6, v56
+; VI-NEXT: v_mov_b32_e32 v7, v8
+; VI-NEXT: v_mov_b32_e32 v8, v57
+; VI-NEXT: v_mov_b32_e32 v10, v58
+; VI-NEXT: v_mov_b32_e32 v12, v60
+; VI-NEXT: v_mov_b32_e32 v13, v63
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v28
+; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v27
+; VI-NEXT: v_bfe_u32 v27, v29, 8, 8
+; VI-NEXT: v_bfe_u32 v28, v30, 8, 8
+; VI-NEXT: v_bfe_u32 v56, v31, 8, 8
+; VI-NEXT: v_bfe_u32 v57, v33, 8, 8
+; VI-NEXT: v_bfe_u32 v58, v35, 8, 8
+; VI-NEXT: v_bfe_u32 v60, v37, 8, 8
+; VI-NEXT: v_bfe_u32 v63, v48, 8, 8
+; VI-NEXT: v_bfe_u32 v54, v50, 8, 8
; VI-NEXT: .LBB96_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v18
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24
-; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v26
+; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23
+; VI-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v23, v52, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v53
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59
-; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; VI-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23
-; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v22
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v62
+; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46
-; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v63
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22
-; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43
-; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21
-; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20
-; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
+; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v28
+; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17
+; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25
-; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -71641,7 +71551,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68
-; SI-NEXT: v_readfirstlane_b32 s15, v27
+; SI-NEXT: v_readfirstlane_b32 s14, v27
; SI-NEXT: v_readfirstlane_b32 s40, v26
; SI-NEXT: v_readfirstlane_b32 s12, v19
; SI-NEXT: v_readfirstlane_b32 s13, v18
@@ -71690,17 +71600,17 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s22, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s14, s23, 24
+; SI-NEXT: s_lshl_b32 s15, s23, 24
; SI-NEXT: s_and_b32 s4, s4, 0xffff
-; SI-NEXT: s_or_b32 s5, s14, s5
+; SI-NEXT: s_or_b32 s5, s15, s5
; SI-NEXT: s_or_b32 s41, s4, s5
; SI-NEXT: s_and_b32 s4, s18, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
-; SI-NEXT: s_lshl_b32 s14, s19, 24
-; SI-NEXT: s_or_b32 s4, s14, s4
-; SI-NEXT: s_and_b32 s14, s28, 0xff
+; SI-NEXT: s_lshl_b32 s15, s19, 24
+; SI-NEXT: s_or_b32 s4, s15, s4
+; SI-NEXT: s_and_b32 s15, s28, 0xff
; SI-NEXT: s_lshl_b32 s46, s29, 8
-; SI-NEXT: s_or_b32 s14, s14, s46
+; SI-NEXT: s_or_b32 s15, s15, s46
; SI-NEXT: s_and_b32 s46, s6, 0xff
; SI-NEXT: s_lshl_b32 s46, s46, 16
; SI-NEXT: s_lshl_b32 s47, s7, 24
@@ -71789,7 +71699,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_or_b32_e32 v63, v59, v34
; SI-NEXT: v_or_b32_e32 v39, s4, v25
; SI-NEXT: s_and_b32 s4, s40, 0xff
-; SI-NEXT: s_lshl_b32 s56, s15, 8
+; SI-NEXT: s_lshl_b32 s56, s14, 8
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v48, v32, v63
; SI-NEXT: v_and_b32_e32 v32, 0xff, v57
@@ -71812,12 +71722,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_or_b32_e32 v33, s4, v33
; SI-NEXT: s_and_b32 s4, s45, 0xff
; SI-NEXT: s_lshl_b32 s56, s44, 8
-; SI-NEXT: s_and_b32 s14, s14, 0xffff
+; SI-NEXT: s_and_b32 s15, s15, 0xffff
; SI-NEXT: s_or_b32 s4, s4, s56
-; SI-NEXT: s_or_b32 s14, s14, s57
+; SI-NEXT: s_or_b32 s15, s15, s57
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: v_alignbit_b32 v1, s41, v1, 16
-; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16
+; SI-NEXT: v_alignbit_b32 v5, s15, v5, 16
; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16
; SI-NEXT: v_alignbit_b32 v13, v36, v23, 16
; SI-NEXT: v_alignbit_b32 v21, v38, v29, 16
@@ -71883,7 +71793,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: s_add_i32 s40, s40, 3
; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v1
; SI-NEXT: s_and_b32 s4, s40, 0xff
-; SI-NEXT: s_lshl_b32 s5, s15, 8
+; SI-NEXT: s_lshl_b32 s5, s14, 8
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
@@ -71987,7 +71897,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: s_add_i32 s14, s4, 0x3000000
+; SI-NEXT: s_add_i32 s15, s4, 0x3000000
; SI-NEXT: s_and_b32 s4, s16, 0xff
; SI-NEXT: s_lshl_b32 s5, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
@@ -72028,7 +71938,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v0, s47
; SI-NEXT: v_alignbit_b32 v1, s41, v0, 16
; SI-NEXT: v_mov_b32_e32 v0, s46
-; SI-NEXT: v_alignbit_b32 v5, s14, v0, 16
+; SI-NEXT: v_alignbit_b32 v5, s15, v0, 16
; SI-NEXT: v_alignbit_b32 v9, v10, v35, 16
; SI-NEXT: v_alignbit_b32 v13, v36, v37, 16
; SI-NEXT: v_alignbit_b32 v17, v18, v39, 16
@@ -72036,7 +71946,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_alignbit_b32 v25, v26, v33, 16
; SI-NEXT: v_alignbit_b32 v29, v48, v34, 16
; SI-NEXT: s_lshr_b32 s56, s41, 16
-; SI-NEXT: s_lshr_b32 s57, s14, 16
+; SI-NEXT: s_lshr_b32 s57, s15, 16
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v36
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18
@@ -72065,7 +71975,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v2, s41
; SI-NEXT: v_mov_b32_e32 v3, s56
; SI-NEXT: v_mov_b32_e32 v4, s46
-; SI-NEXT: v_mov_b32_e32 v6, s14
+; SI-NEXT: v_mov_b32_e32 v6, s15
; SI-NEXT: v_mov_b32_e32 v7, s57
; SI-NEXT: v_mov_b32_e32 v8, v35
; SI-NEXT: v_mov_b32_e32 v12, v37
@@ -72085,7 +71995,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; SI-NEXT: ; implicit-def: $sgpr56
; SI-NEXT: ; implicit-def: $sgpr46
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $sgpr14
+; SI-NEXT: ; implicit-def: $sgpr15
; SI-NEXT: ; implicit-def: $sgpr57
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr9
@@ -72192,11 +72102,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v39
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49
+; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v49
; VI-NEXT: s_cbranch_scc0 .LBB99_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -72255,10 +72165,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
@@ -72318,9 +72228,9 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24
; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45
; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55
; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -72377,8 +72287,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: s_addk_i32 s6, 0x300
; VI-NEXT: s_addk_i32 s8, 0x300
; VI-NEXT: s_addk_i32 s10, 0x300
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57
; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45
; VI-NEXT: s_addk_i32 s4, 0x300
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_lshl_b32 s7, s7, 16
@@ -72386,8 +72296,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: s_and_b32 s10, s10, 0xffff
; VI-NEXT: s_and_b32 s8, s8, 0xffff
; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: s_or_b32 s9, s9, s10
; VI-NEXT: s_or_b32 s7, s7, s8
; VI-NEXT: s_or_b32 s5, s5, s6
@@ -80753,7 +80663,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v52, s44
-; VI-NEXT: v_mov_b32_e32 v19, s67
; VI-NEXT: v_mov_b32_e32 v12, s66
; VI-NEXT: v_mov_b32_e32 v20, s65
; VI-NEXT: v_mov_b32_e32 v13, s64
@@ -80812,6 +80721,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
; VI-NEXT: v_mov_b32_e32 v4, s14
; VI-NEXT: v_mov_b32_e32 v3, s40
; VI-NEXT: v_mov_b32_e32 v9, s75
+; VI-NEXT: v_mov_b32_e32 v19, s67
; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v52, s62
@@ -85175,11 +85085,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v39
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49
+; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v49
; VI-NEXT: s_cbranch_scc0 .LBB107_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -85238,10 +85148,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
@@ -85301,9 +85211,9 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24
; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45
; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55
; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -85360,8 +85270,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: s_addk_i32 s6, 0x300
; VI-NEXT: s_addk_i32 s8, 0x300
; VI-NEXT: s_addk_i32 s10, 0x300
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57
; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45
; VI-NEXT: s_addk_i32 s4, 0x300
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_lshl_b32 s7, s7, 16
@@ -85369,8 +85279,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; VI-NEXT: s_and_b32 s10, s10, 0xffff
; VI-NEXT: s_and_b32 s8, s8, 0xffff
; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: s_or_b32 s9, s9, s10
; VI-NEXT: s_or_b32 s7, s7, s8
; VI-NEXT: s_or_b32 s5, s5, s6
@@ -88009,6 +87919,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: ; kill: killed $vgpr17
; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -88025,47 +87937,46 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr27
-; GFX9-NEXT: ; implicit-def: $vgpr28
-; GFX9-NEXT: ; implicit-def: $vgpr23
-; GFX9-NEXT: ; implicit-def: $vgpr51
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr29
-; GFX9-NEXT: ; implicit-def: $vgpr39
-; GFX9-NEXT: ; implicit-def: $vgpr22
+; GFX9-NEXT: ; implicit-def: $vgpr23
+; GFX9-NEXT: ; implicit-def: $vgpr24
+; GFX9-NEXT: ; implicit-def: $vgpr26
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr30
-; GFX9-NEXT: ; implicit-def: $vgpr45
-; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr25
; GFX9-NEXT: ; implicit-def: $vgpr31
-; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr32
-; GFX9-NEXT: ; implicit-def: $vgpr42
-; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr61
; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr60
; GFX9-NEXT: ; implicit-def: $vgpr36
-; GFX9-NEXT: ; implicit-def: $vgpr52
-; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr37
-; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr57
; GFX9-NEXT: ; implicit-def: $vgpr38
-; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr45
; GFX9-NEXT: ; implicit-def: $vgpr49
-; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; kill: killed $vgpr17
-; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: ; implicit-def: $vgpr25
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr28
; GFX9-NEXT: ; implicit-def: $vgpr21
; GFX9-NEXT: ; implicit-def: $vgpr20
; GFX9-NEXT: ; implicit-def: $vgpr19
@@ -88081,51 +87992,52 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v12
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v10
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4]
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v15
-; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v15
-; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v14
+; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v14
; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v14
-; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v46, 24, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1
-; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
-; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2]
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v1
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b64 v[28:29], 24, v[5:6]
+; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[1:2]
; GFX9-NEXT: .LBB108_2: ; %Flow
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB108_4
@@ -88160,7 +88072,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: s_mov_b32 s7, 0x7060302
; GFX9-NEXT: v_cndmask_b32_e32 v1, v19, v20, vcc
-; GFX9-NEXT: v_perm_b32 v27, v1, v18, s7
+; GFX9-NEXT: v_perm_b32 v23, v1, v18, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX9-NEXT: v_bfe_u32 v19, v1, 16, 1
@@ -88189,7 +88101,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc
-; GFX9-NEXT: v_perm_b32 v29, v3, v19, s7
+; GFX9-NEXT: v_perm_b32 v30, v3, v19, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1
@@ -88218,7 +88130,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v22, vcc
-; GFX9-NEXT: v_perm_b32 v31, v5, v20, s7
+; GFX9-NEXT: v_perm_b32 v32, v5, v20, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v8
; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
@@ -88238,278 +88150,281 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_add3_u32 v22, v22, v21, s6
-; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v25, vcc
; GFX9-NEXT: v_bfe_u32 v22, v7, 16, 1
; GFX9-NEXT: v_add3_u32 v22, v22, v7, s6
-; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v7
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v23, vcc
-; GFX9-NEXT: v_perm_b32 v33, v7, v21, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v25, vcc
+; GFX9-NEXT: v_perm_b32 v34, v7, v21, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10
; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; GFX9-NEXT: v_bfe_u32 v22, v7, 16, 1
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_add3_u32 v22, v22, v7, s6
-; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v7
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v23, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v25, vcc
; GFX9-NEXT: v_bfe_u32 v22, v10, 16, 1
; GFX9-NEXT: v_add3_u32 v22, v22, v10, s6
-; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v23, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v25, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v9
; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX9-NEXT: v_bfe_u32 v25, v22, 16, 1
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_add3_u32 v23, v23, v22, s6
-; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX9-NEXT: v_add3_u32 v25, v25, v22, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
-; GFX9-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX9-NEXT: v_add3_u32 v23, v23, v9, s6
-; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v25, v26, vcc
+; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v9, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v23, v24, vcc
-; GFX9-NEXT: v_perm_b32 v35, v9, v22, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v26, vcc
+; GFX9-NEXT: v_perm_b32 v36, v9, v22, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v12
; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; GFX9-NEXT: v_bfe_u32 v23, v9, 16, 1
+; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_add3_u32 v23, v23, v9, s6
-; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v9
+; GFX9-NEXT: v_add3_u32 v25, v25, v9, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v23, v24, vcc
-; GFX9-NEXT: v_bfe_u32 v23, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v23, v23, v12, s6
-; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v26, vcc
+; GFX9-NEXT: v_bfe_u32 v25, v12, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v12, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v23, v24, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v11
-; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v25, v26, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v11
+; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_add3_u32 v24, v24, v23, s6
-; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
-; GFX9-NEXT: v_bfe_u32 v24, v11, 16, 1
-; GFX9-NEXT: v_add3_u32 v24, v24, v11, s6
-; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX9-NEXT: v_bfe_u32 v26, v11, 16, 1
+; GFX9-NEXT: v_add3_u32 v26, v26, v11, s6
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v24, v25, vcc
-; GFX9-NEXT: v_perm_b32 v37, v11, v23, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v26, v27, vcc
+; GFX9-NEXT: v_perm_b32 v38, v11, v25, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14
; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; GFX9-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX9-NEXT: v_bfe_u32 v26, v11, 16, 1
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT: v_add3_u32 v24, v24, v11, s6
-; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v11
+; GFX9-NEXT: v_add3_u32 v26, v26, v11, s6
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v24, v25, vcc
-; GFX9-NEXT: v_bfe_u32 v24, v14, 16, 1
-; GFX9-NEXT: v_add3_u32 v24, v24, v14, s6
-; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v26, v27, vcc
+; GFX9-NEXT: v_bfe_u32 v26, v14, 16, 1
+; GFX9-NEXT: v_add3_u32 v26, v26, v14, s6
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v24, v25, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v13
-; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v26, v27, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v13
+; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_add3_u32 v25, v25, v24, s6
-; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX9-NEXT: v_add3_u32 v27, v27, v26, s6
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
-; GFX9-NEXT: v_bfe_u32 v25, v13, 16, 1
-; GFX9-NEXT: v_add3_u32 v25, v25, v13, s6
-; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; GFX9-NEXT: v_bfe_u32 v27, v13, 16, 1
+; GFX9-NEXT: v_add3_u32 v27, v27, v13, s6
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v26, vcc
-; GFX9-NEXT: v_perm_b32 v48, v13, v24, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v27, v28, vcc
+; GFX9-NEXT: v_perm_b32 v48, v13, v26, s7
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16
; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX9-NEXT: v_bfe_u32 v25, v13, 16, 1
+; GFX9-NEXT: v_bfe_u32 v27, v13, 16, 1
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_add3_u32 v25, v25, v13, s6
-; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v13
+; GFX9-NEXT: v_add3_u32 v27, v27, v13, s6
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v26, vcc
-; GFX9-NEXT: v_bfe_u32 v25, v16, 16, 1
-; GFX9-NEXT: v_add3_u32 v25, v25, v16, s6
-; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v27, v28, vcc
+; GFX9-NEXT: v_bfe_u32 v27, v16, 16, 1
+; GFX9-NEXT: v_add3_u32 v27, v27, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v25, v26, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v15
-; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v27, v28, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v15
+; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6
-; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v25
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX9-NEXT: v_add3_u32 v28, v28, v27, s6
+; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v39, vcc
-; GFX9-NEXT: v_bfe_u32 v26, v15, 16, 1
-; GFX9-NEXT: v_add3_u32 v26, v26, v15, s6
-; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; GFX9-NEXT: v_bfe_u32 v28, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v26, v39, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v28, v29, vcc
; GFX9-NEXT: v_perm_b32 v51, v16, v13, s7
-; GFX9-NEXT: v_perm_b32 v50, v15, v25, s7
-; GFX9-NEXT: v_perm_b32 v28, v2, v17, s7
-; GFX9-NEXT: v_perm_b32 v30, v4, v1, s7
+; GFX9-NEXT: v_perm_b32 v50, v15, v27, s7
+; GFX9-NEXT: v_perm_b32 v24, v2, v17, s7
+; GFX9-NEXT: v_perm_b32 v31, v4, v1, s7
; GFX9-NEXT: v_perm_b32 v49, v14, v11, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v18
; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[50:51]
-; GFX9-NEXT: v_perm_b32 v32, v6, v3, s7
-; GFX9-NEXT: v_perm_b32 v38, v12, v9, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6
+; GFX9-NEXT: v_perm_b32 v33, v6, v3, s7
+; GFX9-NEXT: v_perm_b32 v39, v12, v9, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19
; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[48:49]
-; GFX9-NEXT: v_perm_b32 v34, v8, v5, s7
-; GFX9-NEXT: v_perm_b32 v36, v10, v7, s7
-; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v20
-; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[37:38]
-; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v21
-; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[35:36]
+; GFX9-NEXT: v_perm_b32 v35, v8, v5, s7
+; GFX9-NEXT: v_perm_b32 v37, v10, v7, s7
; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14
; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22
-; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[33:34]
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v20
+; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[38:39]
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v26
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[29:30]
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[36:37]
+; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[30:31]
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
-; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v24
-; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[31:32]
-; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[27:28]
-; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v51
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v51
-; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v50
-; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v50
-; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[34:35]
+; GFX9-NEXT: v_lshrrev_b64 v[28:29], 24, v[32:33]
+; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[23:24]
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v51
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v51
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v50
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v50
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v49
; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v49
-; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v48
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
-; GFX9-NEXT: v_lshrrev_b32_e32 v46, 24, v38
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v39
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v39
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v38
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v38
-; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v37
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v37
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v36
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v35
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v35
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35
-; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v33
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33
-; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v32
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v32
-; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v31
; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v31
-; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30
; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v30
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v28
-; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v28
-; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v23
; GFX9-NEXT: .LBB108_4: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v27
-; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v23, v28, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v51
+; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v23
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v53
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v26
+; GFX9-NEXT: v_or_b32_sdwa v22, v24, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50
-; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v29
-; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v30
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v63
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50
; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v32
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v28
; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v32
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v62
; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21
; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60
; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57
; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19
; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v46
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v47
; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -88518,26 +88433,26 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44
; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17
; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55
; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -88566,62 +88481,62 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -88634,18 +88549,18 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7
@@ -88658,246 +88573,247 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v1.l
; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v17, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v20, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v20, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v28.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v21, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v27
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v20, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v28.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v17, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v26
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v19
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v17, v23, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v21, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v18, v19 :: v_dual_add_f32 v18, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v18, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v22 :: v_dual_cndmask_b32 v3, v4, v23
; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v23, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v29
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v27
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v17, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v19, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v29, v19, v22 :: v_dual_add_f32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v17, v21, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v17, v21 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v38.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v17, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v6, v17 :: v_dual_add_f32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v33
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v34.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v36, v6, v17 :: v_dual_and_b32 v23, 0xffff0000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v36, v5, v22 :: v_dual_and_b32 v23, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v21, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v36
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v23
+; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v29
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v5, v22 :: v_dual_and_b32 v23, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v21, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v20, 16, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v32
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v20, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v39, v7, v21 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v7, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v38.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v35, v19, v22 :: v_dual_lshlrev_b32 v10, 16, v12
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v23
-; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v39 :: v_dual_cndmask_b32 v52, v22, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v22, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v23 :: v_dual_add_f32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v19, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v48 :: v_dual_add_f32 v14, 0x40c00000, v14
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v49.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v24, v50, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v24, v51, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v53
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v37
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v19, v21, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 24, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v22
; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v7, 0x7fff
@@ -88906,28 +88822,27 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v70, v21, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v68, v21, v22
; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v19, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v14, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14
; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v48, v21, v22
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v25, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v23, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v23, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12
+; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9
; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v68.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v64.h
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v15
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v23, v13, 0x7fff
@@ -88937,42 +88852,42 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v14, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v13, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v13, v25, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v14, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v15, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v23, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v23, v52, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v85.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v13, v37, v15, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v70.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v67
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v81.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v51, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v68.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v48
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v19, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v84.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v69.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v71.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v23, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v82
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v9
+; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v54
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v9
; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13
; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v21, v7
; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v18, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
@@ -88981,21 +88896,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v24.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v112.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v102.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -89011,13 +88927,13 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v8
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v99.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v100.l
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
@@ -89027,19 +88943,19 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v50.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v86.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v85.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v8.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v22
@@ -89050,15 +88966,15 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v98.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v83.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
@@ -89071,24 +88987,24 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v5
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v82.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v22
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v97.l
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
@@ -89105,16 +89021,16 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v13
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v51.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
@@ -89146,107 +89062,107 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_2
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[7:8]
; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[5:6]
; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 24, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 24, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 8, v13
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 24, v12
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 8, v12
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v11
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v8
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v7
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 24, v6
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v6
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v4
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v3
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[11:12]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[3:4]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 24, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 24, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 8, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 8, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 24, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 8, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 24, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 8, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 8, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[1:2]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v1
; GFX11-FAKE16-NEXT: .LBB108_2: ; %Flow
; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4
@@ -89279,222 +89195,227 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v19
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_perm_b32 v27, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v21, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v21
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v22, v17, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v23, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v21, 16, 1
-; GFX11-FAKE16-NEXT: v_perm_b32 v26, v19, v17, 0x7060302
-; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v18, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v2, v1, 0x7060302
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v23, 16, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v19, v17, 0x7060302
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v23, 16, 1
; GFX11-FAKE16-NEXT: v_add3_u32 v19, v19, v4, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v22, v24 :: v_dual_and_b32 v3, 0xffff0000, v3
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v27
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v26
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v19, v23 :: v_dual_lshlrev_b32 v23, 16, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v19, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v19, v26, v23, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v19, v24 :: v_dual_lshlrev_b32 v24, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-FAKE16-NEXT: v_add3_u32 v19, v24, v21, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-FAKE16-NEXT: v_perm_b32 v29, v4, v18, 0x7060302
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v19, v22 :: v_dual_lshlrev_b32 v22, 16, v5
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v23
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v24, 0x40c00000, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v25 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v3
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v24, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v22, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v4, v18, 0x7060302
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v20, v24, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v21, 16, 1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v6
-; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v22, 16, 1
-; GFX11-FAKE16-NEXT: v_perm_b32 v28, v3, v19, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v3, v19, 0x7060302
; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v20, v20, v21, 0x7fff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v23, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v20, v23 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v27, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v25, v22, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v24, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v6, v23, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v24
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v29, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v97, v3, v22, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v6, v27 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v28 :: v_dual_lshlrev_b32 v27, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v7
-; GFX11-FAKE16-NEXT: v_perm_b32 v86, v3, v20, 0x7060302
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v21, v25 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v23, v29 :: v_dual_and_b32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v6, 16, 1
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3
-; GFX11-FAKE16-NEXT: v_perm_b32 v85, v5, v22, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v96, v5, v24, 0x7060302
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v6, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v8
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v19
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v85
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v96
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v23, v28, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v25, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-FAKE16-NEXT: v_perm_b32 v83, v5, v6, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v23, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v23
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v85, v5, v6, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v27, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v27
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v22
-; GFX11-FAKE16-NEXT: v_add3_u32 v8, v30, v23, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v8, v24 :: v_dual_lshlrev_b32 v24, 16, v9
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v24
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v30, v27, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v85
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v8, v28 :: v_dual_lshlrev_b32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v29
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v24, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v28, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v8
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v7
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v7, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v21, v30, vcc_lo
-; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v7, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v23, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v8, 16, 1
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_perm_b32 v82, v7, v23, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v84, v7, v27, 0x7060302
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1
-; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v8, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v82
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v84
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v21, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v23, v29, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: v_add3_u32 v10, v31, v24, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v24
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v82
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v31, v28, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v30 :: v_dual_lshlrev_b32 v30, 16, v12
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_perm_b32 v70, v7, v8, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7
+; GFX11-FAKE16-NEXT: v_perm_b32 v81, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v10, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v10, v29, vcc_lo
; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v30
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v23
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v70
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v81
; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v12, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v12, 0x7fff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v9, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v9
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v31, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v11
-; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v23, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v10, 16, 1
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT: v_perm_b32 v69, v9, v24, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v80, v9, v28, 0x7060302
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v21
-; GFX11-FAKE16-NEXT: v_add3_u32 v21, v25, v10, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v23
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v29, v10, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v10
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v12
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v9, 16, 1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v69
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v21, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v80
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v23, v29, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v9
-; GFX11-FAKE16-NEXT: v_add3_u32 v21, v32, v9, 0x7fff
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 8, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v32, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 8, v80
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v14
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-FAKE16-NEXT: v_perm_b32 v55, v12, v10, 0x7060302
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v25, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v65, v12, v10, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v23, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v30
; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v11, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v11
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 8, v55
-; GFX11-FAKE16-NEXT: v_add3_u32 v25, v31, v11, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 8, v65
+; GFX11-FAKE16-NEXT: v_add3_u32 v29, v31, v11, 0x7fff
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v25, v30, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v31
-; GFX11-FAKE16-NEXT: v_add3_u32 v30, v32, v21, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v29, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_add3_u32 v30, v32, v23, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v23
; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v14, 16, 1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v25
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v30, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v30, v31, vcc_lo
; GFX11-FAKE16-NEXT: v_add3_u32 v30, v32, v14, 0x7fff
; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v14
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT: v_add3_u32 v32, v33, v25, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v32, v33, v29, 0x7fff
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v31, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v33
; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-FAKE16-NEXT: v_perm_b32 v54, v11, v9, 0x7060302
-; GFX11-FAKE16-NEXT: v_perm_b32 v50, v14, v21, 0x7060302
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_perm_b32 v64, v11, v9, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v52, v14, v23, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v29, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15
; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v13, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v13
; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v30, 16, 1
@@ -89502,8 +89423,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX11-FAKE16-NEXT: v_add3_u32 v31, v35, v13, 0x7fff
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v24
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[84:85]
; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc_lo
; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v34
@@ -89512,191 +89433,190 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v16, 16, 1
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v15, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v31
; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v15
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v51, v13, v29, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo
; GFX11-FAKE16-NEXT: v_add3_u32 v33, v34, v16, 0x7fff
; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v16
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v15, 16, 1
; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v15, 0x7fff
-; GFX11-FAKE16-NEXT: v_perm_b32 v49, v13, v25, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[96:97]
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v15, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v23
+; GFX11-FAKE16-NEXT: v_perm_b32 v39, v16, v30, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v35, v36, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v20
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v49
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v30, v37, vcc_lo
-; GFX11-FAKE16-NEXT: v_perm_b32 v37, v16, v32, 0x7060302
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v16
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v32
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_perm_b32 v36, v15, v31, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[64:65]
+; GFX11-FAKE16-NEXT: v_perm_b32 v38, v15, v31, 0x7060302
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[25:26]
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v31
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v21
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 24, v37
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[36:37]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[49:50]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[54:55]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[20:21], 24, v[69:70]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[82:83]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[85:86]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[28:29]
-; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[26:27]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v37
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v36
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v36
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 24, v50
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 8, v49
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 24, v55
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v70
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v83
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v83
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 24, v86
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v86
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v29
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v28
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[80:81]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[20:21]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[38:39]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[51:52]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 24, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 24, v52
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 8, v52
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v51
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 8, v51
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 24, v65
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v64
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v64
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 24, v81
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v85
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v84
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v97
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v97
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 8, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v20
; GFX11-FAKE16-NEXT: .LBB108_4: ; %end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v97
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v99
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v30
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v24
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v81
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v20, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v83
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v98
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v97
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v26, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v87
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v29
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v86
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v21, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v70
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v30, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v96
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v84
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v87
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v86
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v28
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v65
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v81, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v28, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v31, 8, v84
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v28
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v65, v68
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v30, v31
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v25
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v71
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v81
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v27
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v52
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v69
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v54
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v71
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v23, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v69
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v20, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v25, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v26
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v66
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v68
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v38
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v55
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v65
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v54
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v53
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v55
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v23
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v24
; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v25, v26
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v28, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v28, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v22
; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v33
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v53
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v52
; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v49
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v51
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v50
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v18
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v39
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v49
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19
; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v20, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v23, v18
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v24
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v33
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v39
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v37
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v17
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v34
; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v22
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v23, v17
@@ -89766,21 +89686,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v16
; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v17
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
; SI-NEXT: v_mul_f32_e64 v22, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
@@ -89794,25 +89714,19 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
@@ -89868,213 +89782,219 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; SI-NEXT: v_alignbit_b32 v6, v1, v28, 16
+; SI-NEXT: v_alignbit_b32 v6, v1, v29, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16
+; SI-NEXT: v_alignbit_b32 v3, v1, v32, 16
+; SI-NEXT: v_mov_b32_e32 v48, v12
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44
; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16
-; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16
+; SI-NEXT: v_mov_b32_e32 v51, v14
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14
+; SI-NEXT: v_alignbit_b32 v14, v12, v28, 16
+; SI-NEXT: v_alignbit_b32 v4, v14, v3, 24
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v14, v3, 16
+; SI-NEXT: v_alignbit_b32 v8, v7, v38, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
+; SI-NEXT: v_alignbit_b32 v4, v8, v2, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v4, v8, v2, 8
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v33
; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44
-; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16
-; SI-NEXT: v_mov_b32_e32 v31, v23
+; SI-NEXT: v_alignbit_b32 v5, v4, v45, 16
+; SI-NEXT: v_mov_b32_e32 v37, v32
+; SI-NEXT: v_mov_b32_e32 v32, v23
; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16
-; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16
; SI-NEXT: v_alignbit_b32 v23, v5, v1, 24
-; SI-NEXT: v_mov_b32_e32 v38, v36
-; SI-NEXT: v_alignbit_b32 v36, v20, v6, 24
-; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24
-; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16
-; SI-NEXT: v_mov_b32_e32 v53, v32
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16
-; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8
+; SI-NEXT: v_mov_b32_e32 v34, v29
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v23, v5, v1, 8
+; SI-NEXT: v_mov_b32_e32 v29, v26
+; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26
+; SI-NEXT: v_mov_b32_e32 v26, v42
+; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42
+; SI-NEXT: v_mov_b32_e32 v42, v33
+; SI-NEXT: v_mov_b32_e32 v36, v31
+; SI-NEXT: v_mov_b32_e32 v49, v35
+; SI-NEXT: v_mov_b32_e32 v52, v39
+; SI-NEXT: v_alignbit_b32 v53, v20, v6, 24
; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16
; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8
-; SI-NEXT: v_mov_b32_e32 v35, v29
-; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16
+; SI-NEXT: v_mov_b32_e32 v35, v28
; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8
-; SI-NEXT: v_mov_b32_e32 v37, v33
-; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v39, v38
+; SI-NEXT: v_mov_b32_e32 v50, v45
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v25, v22
; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22
; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v23, v41
; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41
; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19
-; SI-NEXT: v_mov_b32_e32 v28, v26
-; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26
; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16
-; SI-NEXT: v_mov_b32_e32 v26, v42
-; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42
; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11
-; SI-NEXT: v_mov_b32_e32 v29, v43
+; SI-NEXT: v_mov_b32_e32 v31, v43
; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43
; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20
-; SI-NEXT: v_mov_b32_e32 v34, v44
+; SI-NEXT: v_mov_b32_e32 v28, v44
; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44
; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14
-; SI-NEXT: v_mov_b32_e32 v33, v56
+; SI-NEXT: v_mov_b32_e32 v38, v56
; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56
; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48
-; SI-NEXT: v_mov_b32_e32 v48, v32
-; SI-NEXT: v_mov_b32_e32 v32, v50
-; SI-NEXT: v_mov_b32_e32 v50, v25
-; SI-NEXT: v_mov_b32_e32 v25, v36
-; SI-NEXT: v_mov_b32_e32 v36, v38
+; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42
; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28
+; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48
; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45
-; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8
-; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43
-; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39
+; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36
; SI-NEXT: v_alignbit_b32 v8, v7, v3, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v32, v8, v2, 16
-; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44
+; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8
+; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43
+; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42
+; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v53, v20, v6, 24
+; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16
+; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24
-; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16
-; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8
-; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v17
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17
-; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56
; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41
-; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56
-; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8
; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16
-; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16
; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19
-; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24
-; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16
-; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26
+; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47
+; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16
+; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11
+; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47
+; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v23
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v29
; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v59
+; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56
+; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56
+; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8
+; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
; SI-NEXT: v_alignbit_b32 v30, v24, v22, 16
; SI-NEXT: v_alignbit_b32 v22, v30, v27, 24
; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30
@@ -90103,18 +90023,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47
-; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24
-; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11
-; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16
@@ -90123,15 +90033,30 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v22, v14, v3, 24
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v22, v14, v3, 16
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v22, v8, v2, 24
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v22, v8, v2, 16
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v22, v8, v2, 8
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v22, v5, v1, 24
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v22, v5, v1, 16
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v22, v5, v1, 8
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59
; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45
; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44
@@ -90264,7 +90189,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_or_b32_e32 v6, v6, v9
; SI-NEXT: v_and_b32_e32 v9, 0xff, v55
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v53
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
; SI-NEXT: v_or_b32_e32 v9, v10, v9
; SI-NEXT: v_or_b32_e32 v6, v6, v9
@@ -90285,10 +90210,14 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54
; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v52
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v50
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; SI-NEXT: v_or_b32_e32 v6, v9, v6
; SI-NEXT: v_or_b32_e32 v3, v3, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0
@@ -90305,15 +90234,19 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_or_b32_e32 v3, v3, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0
; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v32
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v6, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
@@ -90331,16 +90264,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
@@ -90379,39 +90314,56 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: v_mov_b32_e32 v53, v32
+; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: v_mov_b32_e32 v52, v39
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v49, v48
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: v_mov_b32_e32 v51, v14
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v37, v33
+; SI-NEXT: v_mov_b32_e32 v50, v45
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v33, v56
+; SI-NEXT: v_mov_b32_e32 v49, v35
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v35, v29
+; SI-NEXT: v_mov_b32_e32 v48, v12
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v34, v44
+; SI-NEXT: v_mov_b32_e32 v39, v38
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v31, v23
+; SI-NEXT: v_mov_b32_e32 v38, v56
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v29, v43
+; SI-NEXT: v_mov_b32_e32 v37, v32
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v28, v26
+; SI-NEXT: v_mov_b32_e32 v36, v31
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v26, v42
+; SI-NEXT: v_mov_b32_e32 v35, v28
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v23, v41
+; SI-NEXT: v_mov_b32_e32 v28, v44
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: v_mov_b32_e32 v34, v29
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: v_mov_b32_e32 v32, v23
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: v_mov_b32_e32 v31, v43
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: v_mov_b32_e32 v29, v26
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_mov_b32_e32 v26, v42
+; SI-NEXT: v_mov_b32_e32 v23, v41
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v25, v22
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr4
@@ -90439,33 +90391,28 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr4
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: s_branch .LBB109_2
;
@@ -91241,27 +91188,27 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_lshr_b32 s60, s22, 8
; GFX9-NEXT: s_lshr_b32 s48, s21, 24
; GFX9-NEXT: s_lshr_b32 s78, s21, 16
-; GFX9-NEXT: s_lshr_b32 s59, s21, 8
+; GFX9-NEXT: s_lshr_b32 s45, s21, 8
; GFX9-NEXT: s_lshr_b32 s49, s20, 16
-; GFX9-NEXT: s_lshr_b32 s58, s20, 8
+; GFX9-NEXT: s_lshr_b32 s44, s20, 8
; GFX9-NEXT: s_lshr_b32 s50, s19, 24
; GFX9-NEXT: s_lshr_b32 s77, s19, 16
-; GFX9-NEXT: s_lshr_b32 s57, s19, 8
+; GFX9-NEXT: s_lshr_b32 s13, s19, 8
; GFX9-NEXT: s_lshr_b32 s51, s18, 16
-; GFX9-NEXT: s_lshr_b32 s56, s18, 8
+; GFX9-NEXT: s_lshr_b32 s12, s18, 8
; GFX9-NEXT: s_lshr_b32 s52, s17, 24
; GFX9-NEXT: s_lshr_b32 s76, s17, 16
; GFX9-NEXT: s_lshr_b32 s53, s17, 8
; GFX9-NEXT: s_lshr_b32 s54, s16, 16
; GFX9-NEXT: s_lshr_b32 s55, s16, 8
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], 24
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[28:29], 24
+; GFX9-NEXT: s_lshr_b64 s[14:15], s[26:27], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[24:25], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[22:23], 24
+; GFX9-NEXT: s_lshr_b64 s[46:47], s[20:21], 24
+; GFX9-NEXT: s_lshr_b64 s[56:57], s[18:19], 24
+; GFX9-NEXT: s_lshr_b64 s[58:59], s[16:17], 24
; GFX9-NEXT: s_cbranch_execnz .LBB109_3
; GFX9-NEXT: .LBB109_2: ; %cmp.true
; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000
@@ -91290,357 +91237,357 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000
; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_bfe_u32 s8, s6, 0x10010
+; GFX9-NEXT: s_add_i32 s8, s8, s6
+; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff
+; GFX9-NEXT: s_bitset1_b32 s6, 22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s16, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s6, s6, s10
+; GFX9-NEXT: s_lshl_b32 s8, s16, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_lshr_b32 s6, s6, 16
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s16, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s16, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s19, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s19, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s77, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s19, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s19, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s19, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s18, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s18, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s10, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s18, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s12, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s18, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s12, s11
+; GFX9-NEXT: s_lshr_b32 s18, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s21, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s12, s18, s10
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s78, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s21, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s78, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s21, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s21, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s21, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s20, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s20, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s10, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s20, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s14, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s20, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s14, s11
+; GFX9-NEXT: s_lshr_b32 s20, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s23, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s44, s20, s10
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s79, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s23, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s79, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s23, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s23, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s23, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s22, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s22, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s10, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s22, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s14, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s22, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s14, s11
+; GFX9-NEXT: s_lshr_b32 s22, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s25, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s10
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s88, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s25, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s88, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s25, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s25, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s25, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s24, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s24, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s10, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s24, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s14, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s24, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s14, s11
+; GFX9-NEXT: s_lshr_b32 s24, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s27, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s10
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s89, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s27, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s89, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s27, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s27, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s27, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s26, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s26, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s10, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s26, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s14, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s26, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s14, s11
+; GFX9-NEXT: s_lshr_b32 s26, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s29, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s10
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s90, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s29, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s90, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s29, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s29, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s29, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s28, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_lshl_b32 s6, s28, 16
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s10, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
+; GFX9-NEXT: s_lshr_b32 s10, s8, 16
+; GFX9-NEXT: s_lshl_b32 s8, s28, 16
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_add_i32 s11, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s14, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s10, s9
-; GFX9-NEXT: s_lshr_b32 s28, s6, 16
-; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s14, s11
+; GFX9-NEXT: s_lshr_b32 s28, s8, 16
+; GFX9-NEXT: s_and_b32 s8, s5, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s10
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
; GFX9-NEXT: s_lshl_b32 s5, s5, 16
; GFX9-NEXT: v_add_f32_e32 v2, s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: s_lshr_b32 s91, s6, 16
-; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010
-; GFX9-NEXT: s_add_i32 s6, s6, s5
-; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff
+; GFX9-NEXT: s_lshr_b32 s91, s8, 16
+; GFX9-NEXT: s_bfe_u32 s8, s5, 0x10010
+; GFX9-NEXT: s_add_i32 s8, s8, s5
+; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff
; GFX9-NEXT: s_bitset1_b32 s5, 22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s5, s5, s8
-; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000
-; GFX9-NEXT: v_add_f32_e32 v2, s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX9-NEXT: s_add_i32 s7, s7, s6
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s5, s5, s10
+; GFX9-NEXT: s_and_b32 s8, s4, 0xffff0000
+; GFX9-NEXT: v_add_f32_e32 v2, s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: s_bfe_u32 s9, s8, 0x10010
+; GFX9-NEXT: s_add_i32 s9, s9, s8
; GFX9-NEXT: s_lshr_b32 s5, s5, 16
-; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff
-; GFX9-NEXT: s_or_b32 s9, s6, 0x400000
+; GFX9-NEXT: s_add_i32 s10, s9, 0x7fff
+; GFX9-NEXT: s_or_b32 s11, s8, 0x400000
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s6, s9, s8
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s8, s11, s10
; GFX9-NEXT: s_lshl_b32 s4, s4, 16
; GFX9-NEXT: v_add_f32_e32 v1, s4, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_lshr_b32 s8, s6, 16
-; GFX9-NEXT: s_bfe_u32 s6, s4, 0x10010
-; GFX9-NEXT: s_add_i32 s6, s6, s4
-; GFX9-NEXT: s_add_i32 s9, s6, 0x7fff
+; GFX9-NEXT: s_lshr_b32 s10, s8, 16
+; GFX9-NEXT: s_bfe_u32 s8, s4, 0x10010
+; GFX9-NEXT: s_add_i32 s8, s8, s4
+; GFX9-NEXT: s_add_i32 s11, s8, 0x7fff
; GFX9-NEXT: s_bitset1_b32 s4, 22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX9-NEXT: s_cselect_b32 s4, s4, s9
+; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s4, s4, s11
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s76
-; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s77
-; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s78
+; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s76
+; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
+; GFX9-NEXT: s_pack_ll_b32_b16 s13, s19, s77
+; GFX9-NEXT: s_pack_ll_b32_b16 s45, s21, s78
; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s79
; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s88
; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s89
; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s90
; GFX9-NEXT: s_pack_ll_b32_b16 s31, s5, s91
-; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s8
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[30:31], 24
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[74:75], 24
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[72:73], 24
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[62:63], 24
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[60:61], 24
-; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24
-; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24
-; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24
+; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s10
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[30:31], 24
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[74:75], 24
+; GFX9-NEXT: s_lshr_b64 s[14:15], s[72:73], 24
+; GFX9-NEXT: s_lshr_b64 s[40:41], s[62:63], 24
+; GFX9-NEXT: s_lshr_b64 s[42:43], s[60:61], 24
+; GFX9-NEXT: s_lshr_b64 s[46:47], s[44:45], 24
+; GFX9-NEXT: s_lshr_b64 s[56:57], s[12:13], 24
+; GFX9-NEXT: s_lshr_b64 s[58:59], s[6:7], 24
; GFX9-NEXT: s_lshr_b32 s92, s31, 24
; GFX9-NEXT: s_lshr_b32 s93, s31, 8
; GFX9-NEXT: s_lshr_b32 s94, s30, 16
@@ -91661,178 +91608,178 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_lshr_b32 s61, s61, 8
; GFX9-NEXT: s_lshr_b32 s39, s60, 16
; GFX9-NEXT: s_lshr_b32 s60, s60, 8
-; GFX9-NEXT: s_lshr_b32 s48, s59, 24
-; GFX9-NEXT: s_lshr_b32 s59, s59, 8
-; GFX9-NEXT: s_lshr_b32 s49, s58, 16
-; GFX9-NEXT: s_lshr_b32 s58, s58, 8
-; GFX9-NEXT: s_lshr_b32 s50, s57, 24
-; GFX9-NEXT: s_lshr_b32 s57, s57, 8
-; GFX9-NEXT: s_lshr_b32 s51, s56, 16
-; GFX9-NEXT: s_lshr_b32 s56, s56, 8
-; GFX9-NEXT: s_lshr_b32 s52, s47, 24
-; GFX9-NEXT: s_lshr_b32 s53, s47, 8
-; GFX9-NEXT: s_lshr_b32 s54, s46, 16
-; GFX9-NEXT: s_lshr_b32 s55, s46, 8
+; GFX9-NEXT: s_lshr_b32 s48, s45, 24
+; GFX9-NEXT: s_lshr_b32 s45, s45, 8
+; GFX9-NEXT: s_lshr_b32 s49, s44, 16
+; GFX9-NEXT: s_lshr_b32 s44, s44, 8
+; GFX9-NEXT: s_lshr_b32 s50, s13, 24
+; GFX9-NEXT: s_lshr_b32 s13, s13, 8
+; GFX9-NEXT: s_lshr_b32 s51, s12, 16
+; GFX9-NEXT: s_lshr_b32 s12, s12, 8
+; GFX9-NEXT: s_lshr_b32 s52, s7, 24
+; GFX9-NEXT: s_lshr_b32 s53, s7, 8
+; GFX9-NEXT: s_lshr_b32 s54, s6, 16
+; GFX9-NEXT: s_lshr_b32 s55, s6, 8
; GFX9-NEXT: .LBB109_3: ; %end
-; GFX9-NEXT: s_and_b32 s7, s16, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s55, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s54, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s44, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s17, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s53, 8
+; GFX9-NEXT: s_and_b32 s6, s16, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s55, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s54, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s58, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s76, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s52, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s17, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s53, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s76, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s52, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s18, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s18, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s12, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s51, 0xff
; GFX9-NEXT: s_lshl_b32 s9, s56, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s51, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s42, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s19, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s57, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s77, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s50, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s19, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s13, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s77, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s50, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s20, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s58, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s49, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s40, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s20, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s44, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s49, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s46, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s21, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s59, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s78, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s48, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s21, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s45, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s78, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s48, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s22, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s60, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s39, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s14, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s22, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s60, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s39, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s42, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s23, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s61, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s79, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s38, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s23, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s61, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s79, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s38, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s24, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s62, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s37, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s12, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s24, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s62, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s37, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s40, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s25, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s63, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s88, 0xff
-; GFX9-NEXT: s_lshl_b32 s11, s36, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s11
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s25, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s63, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s88, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s36, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s26, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s72, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s35, 0xff
-; GFX9-NEXT: s_lshl_b32 s10, s10, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s10
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s26, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s72, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s35, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s14, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s27, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s73, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s89, 0xff
-; GFX9-NEXT: s_lshl_b32 s10, s34, 8
-; GFX9-NEXT: s_or_b32 s9, s9, s10
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s27, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s73, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s89, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s34, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s28, 0xff
-; GFX9-NEXT: s_lshl_b32 s9, s74, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s28, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s74, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s31, 0xff
+; GFX9-NEXT: s_lshl_b32 s9, s10, 8
; GFX9-NEXT: s_or_b32 s7, s7, s9
-; GFX9-NEXT: s_and_b32 s9, s31, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 8
-; GFX9-NEXT: s_or_b32 s8, s9, s8
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s7, s29, 0xff
-; GFX9-NEXT: s_lshl_b32 s8, s75, 8
-; GFX9-NEXT: s_or_b32 s7, s7, s8
-; GFX9-NEXT: s_and_b32 s8, s90, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s6, s29, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s75, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
+; GFX9-NEXT: s_and_b32 s7, s90, 0xff
; GFX9-NEXT: s_lshl_b32 s9, s30, 8
-; GFX9-NEXT: s_or_b32 s8, s8, s9
-; GFX9-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX9-NEXT: s_lshl_b32 s8, s8, 16
-; GFX9-NEXT: s_or_b32 s7, s7, s8
+; GFX9-NEXT: s_or_b32 s7, s7, s9
+; GFX9-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX9-NEXT: s_lshl_b32 s7, s7, 16
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: s_and_b32 s4, s4, 0xff
-; GFX9-NEXT: s_lshl_b32 s7, s95, 8
-; GFX9-NEXT: s_or_b32 s4, s4, s7
-; GFX9-NEXT: s_and_b32 s7, s94, 0xff
-; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: s_or_b32 s6, s7, s6
+; GFX9-NEXT: s_lshl_b32 s6, s95, 8
+; GFX9-NEXT: s_or_b32 s4, s4, s6
+; GFX9-NEXT: s_and_b32 s6, s94, 0xff
+; GFX9-NEXT: s_lshl_b32 s7, s8, 8
+; GFX9-NEXT: s_or_b32 s6, s6, s7
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s4, s4, s6
@@ -91874,49 +91821,49 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: .LBB109_4:
; GFX9-NEXT: ; implicit-def: $sgpr55
; GFX9-NEXT: ; implicit-def: $sgpr54
-; GFX9-NEXT: ; implicit-def: $sgpr44
+; GFX9-NEXT: ; implicit-def: $sgpr58
; GFX9-NEXT: ; implicit-def: $sgpr53
; GFX9-NEXT: ; implicit-def: $sgpr76
; GFX9-NEXT: ; implicit-def: $sgpr52
-; GFX9-NEXT: ; implicit-def: $sgpr56
+; GFX9-NEXT: ; implicit-def: $sgpr12
; GFX9-NEXT: ; implicit-def: $sgpr51
-; GFX9-NEXT: ; implicit-def: $sgpr42
-; GFX9-NEXT: ; implicit-def: $sgpr57
+; GFX9-NEXT: ; implicit-def: $sgpr56
+; GFX9-NEXT: ; implicit-def: $sgpr13
; GFX9-NEXT: ; implicit-def: $sgpr77
; GFX9-NEXT: ; implicit-def: $sgpr50
-; GFX9-NEXT: ; implicit-def: $sgpr58
+; GFX9-NEXT: ; implicit-def: $sgpr44
; GFX9-NEXT: ; implicit-def: $sgpr49
-; GFX9-NEXT: ; implicit-def: $sgpr40
-; GFX9-NEXT: ; implicit-def: $sgpr59
+; GFX9-NEXT: ; implicit-def: $sgpr46
+; GFX9-NEXT: ; implicit-def: $sgpr45
; GFX9-NEXT: ; implicit-def: $sgpr78
; GFX9-NEXT: ; implicit-def: $sgpr48
; GFX9-NEXT: ; implicit-def: $sgpr60
; GFX9-NEXT: ; implicit-def: $sgpr39
-; GFX9-NEXT: ; implicit-def: $sgpr14
+; GFX9-NEXT: ; implicit-def: $sgpr42
; GFX9-NEXT: ; implicit-def: $sgpr61
; GFX9-NEXT: ; implicit-def: $sgpr79
; GFX9-NEXT: ; implicit-def: $sgpr38
; GFX9-NEXT: ; implicit-def: $sgpr62
; GFX9-NEXT: ; implicit-def: $sgpr37
-; GFX9-NEXT: ; implicit-def: $sgpr12
+; GFX9-NEXT: ; implicit-def: $sgpr40
; GFX9-NEXT: ; implicit-def: $sgpr63
; GFX9-NEXT: ; implicit-def: $sgpr88
; GFX9-NEXT: ; implicit-def: $sgpr36
; GFX9-NEXT: ; implicit-def: $sgpr72
; GFX9-NEXT: ; implicit-def: $sgpr35
-; GFX9-NEXT: ; implicit-def: $sgpr10
+; GFX9-NEXT: ; implicit-def: $sgpr14
; GFX9-NEXT: ; implicit-def: $sgpr73
; GFX9-NEXT: ; implicit-def: $sgpr89
; GFX9-NEXT: ; implicit-def: $sgpr34
; GFX9-NEXT: ; implicit-def: $sgpr74
; GFX9-NEXT: ; implicit-def: $sgpr31
-; GFX9-NEXT: ; implicit-def: $sgpr8
+; GFX9-NEXT: ; implicit-def: $sgpr10
; GFX9-NEXT: ; implicit-def: $sgpr75
; GFX9-NEXT: ; implicit-def: $sgpr90
; GFX9-NEXT: ; implicit-def: $sgpr30
; GFX9-NEXT: ; implicit-def: $sgpr95
; GFX9-NEXT: ; implicit-def: $sgpr94
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: ; implicit-def: $sgpr8
; GFX9-NEXT: ; implicit-def: $sgpr93
; GFX9-NEXT: ; implicit-def: $sgpr91
; GFX9-NEXT: ; implicit-def: $sgpr92
@@ -91930,7 +91877,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s30, 0
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s31, 1
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s34, 2
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s35, 3
@@ -91940,6 +91887,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s39, 7
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s48, 8
; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s49, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s50, 10
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 24
@@ -91981,7 +91929,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8
; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[26:27], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[24:25], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[22:23], 24
@@ -91990,7 +91938,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24
; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_3
; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000
@@ -92433,10 +92381,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], vcc, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, vcc_lo, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, vcc_lo, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s42, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s42, 8
; GFX11-TRUE16-NEXT: .LBB109_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s42
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s50
; GFX11-TRUE16-NEXT: s_mov_b32 s7, s49
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s40
; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
@@ -92652,6 +92600,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v17, 10
; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v17, 9
; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v17, 8
; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v17, 7
@@ -92668,7 +92617,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB109_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16
@@ -92726,7 +92675,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s30, 0
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s31, 1
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s34, 2
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s35, 3
@@ -92735,8 +92684,9 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s38, 6
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s39, 7
; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s48, 8
-; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s50, 9
-; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s51, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s49, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s50, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s51, 11
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 24
@@ -92778,7 +92728,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s1, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 8
; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[24:25], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[22:23], 24
@@ -92787,7 +92737,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24
; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_3
; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000
@@ -93230,10 +93180,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[50:51], 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s50, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s50, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s42, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s42, 8
; GFX11-FAKE16-NEXT: .LBB109_3: ; %end
; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s42, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s49, 8
; GFX11-FAKE16-NEXT: s_and_b32 s7, s48, 0xff
; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s40, 8
; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5
@@ -93389,8 +93339,9 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v17, 10
-; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v17, 9
+; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v17, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v17, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v17, 9
; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v17, 8
; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v17, 7
; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v17, 6
@@ -93406,7 +93357,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB109_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39
@@ -95984,12 +95935,12 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
; SI-NEXT: v_readfirstlane_b32 s46, v30
-; SI-NEXT: v_readfirstlane_b32 s44, v23
-; SI-NEXT: v_readfirstlane_b32 s45, v22
-; SI-NEXT: v_readfirstlane_b32 s41, v15
-; SI-NEXT: v_readfirstlane_b32 s43, v14
-; SI-NEXT: v_readfirstlane_b32 s10, v7
-; SI-NEXT: v_readfirstlane_b32 s12, v6
+; SI-NEXT: v_readfirstlane_b32 s41, v23
+; SI-NEXT: v_readfirstlane_b32 s43, v22
+; SI-NEXT: v_readfirstlane_b32 s10, v15
+; SI-NEXT: v_readfirstlane_b32 s12, v14
+; SI-NEXT: v_readfirstlane_b32 s8, v7
+; SI-NEXT: v_readfirstlane_b32 s9, v6
; SI-NEXT: v_readfirstlane_b32 s7, v1
; SI-NEXT: v_readfirstlane_b32 s6, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5
@@ -96022,47 +95973,47 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: s_and_b32 s4, s16, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s17, 24
-; SI-NEXT: s_or_b32 s8, s5, s4
+; SI-NEXT: s_or_b32 s11, s5, s4
; SI-NEXT: s_and_b32 s4, s18, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s19, 24
-; SI-NEXT: s_or_b32 s9, s5, s4
+; SI-NEXT: s_or_b32 s13, s5, s4
; SI-NEXT: s_and_b32 s4, s20, 0xff
; SI-NEXT: s_lshl_b32 s5, s21, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_lshl_b32 s11, s4, 16
+; SI-NEXT: s_lshl_b32 s14, s4, 16
; SI-NEXT: s_and_b32 s4, s22, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s23, 24
-; SI-NEXT: s_or_b32 s13, s5, s4
+; SI-NEXT: s_or_b32 s15, s5, s4
; SI-NEXT: s_and_b32 s4, s24, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s25, 24
-; SI-NEXT: s_or_b32 s14, s5, s4
+; SI-NEXT: s_or_b32 s40, s5, s4
; SI-NEXT: s_and_b32 s4, s26, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s27, 24
-; SI-NEXT: s_or_b32 s15, s5, s4
+; SI-NEXT: s_or_b32 s42, s5, s4
; SI-NEXT: s_and_b32 s4, s28, 0xff
; SI-NEXT: s_lshl_b32 s5, s29, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_lshl_b32 s40, s4, 16
+; SI-NEXT: s_lshl_b32 s44, s4, 16
; SI-NEXT: s_and_b32 s4, s6, 0xff
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s7, 24
-; SI-NEXT: s_or_b32 s42, s5, s4
-; SI-NEXT: s_and_b32 s4, s12, 0xff
-; SI-NEXT: s_lshl_b32 s5, s10, 8
+; SI-NEXT: s_or_b32 s45, s5, s4
+; SI-NEXT: s_and_b32 s4, s9, 0xff
+; SI-NEXT: s_lshl_b32 s5, s8, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_lshl_b32 s60, s4, 16
-; SI-NEXT: s_and_b32 s4, s43, 0xff
-; SI-NEXT: s_lshl_b32 s5, s41, 8
+; SI-NEXT: s_and_b32 s4, s12, 0xff
+; SI-NEXT: s_lshl_b32 s5, s10, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_and_b32_e32 v9, 0xff, v2
; SI-NEXT: s_lshl_b32 s61, s4, 16
; SI-NEXT: v_and_b32_e32 v17, 0xff, v18
-; SI-NEXT: s_and_b32 s4, s45, 0xff
-; SI-NEXT: s_lshl_b32 s5, s44, 8
+; SI-NEXT: s_and_b32 s4, s43, 0xff
+; SI-NEXT: s_lshl_b32 s5, s41, 8
; SI-NEXT: v_and_b32_e32 v25, 0xff, v52
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v3
@@ -96210,10 +96161,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: v_or_b32_e32 v13, v22, v13
; SI-NEXT: v_or_b32_e32 v9, v13, v9
-; SI-NEXT: s_add_i32 s45, s45, 3
+; SI-NEXT: s_add_i32 s43, s43, 3
; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v9
-; SI-NEXT: s_and_b32 s4, s45, 0xff
-; SI-NEXT: s_lshl_b32 s5, s44, 8
+; SI-NEXT: s_and_b32 s4, s43, 0xff
+; SI-NEXT: s_lshl_b32 s5, s41, 8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
@@ -96233,10 +96184,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: v_or_b32_e32 v7, v7, v13
-; SI-NEXT: s_add_i32 s43, s43, 3
+; SI-NEXT: s_add_i32 s12, s12, 3
; SI-NEXT: v_or_b32_e32 v7, v7, v9
-; SI-NEXT: s_and_b32 s4, s43, 0xff
-; SI-NEXT: s_lshl_b32 s5, s41, 8
+; SI-NEXT: s_and_b32 s4, s12, 0xff
+; SI-NEXT: s_lshl_b32 s5, s10, 8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
@@ -96244,10 +96195,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: s_add_i32 s12, s12, 3
+; SI-NEXT: s_add_i32 s9, s9, 3
; SI-NEXT: v_or_b32_e32 v6, s4, v6
-; SI-NEXT: s_and_b32 s4, s12, 0xff
-; SI-NEXT: s_lshl_b32 s5, s10, 8
+; SI-NEXT: s_and_b32 s4, s9, 0xff
+; SI-NEXT: s_lshl_b32 s5, s8, 8
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
@@ -96335,14 +96286,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: s_add_i32 s5, s5, 0x3000000
; SI-NEXT: s_add_i32 s6, s6, 0x3000000
; SI-NEXT: s_add_i32 s7, s7, 0x3000000
-; SI-NEXT: s_and_b32 s9, s7, 0xffff0000
-; SI-NEXT: s_lshl_b32 s8, s7, 16
-; SI-NEXT: s_and_b32 s13, s6, 0xffff0000
-; SI-NEXT: s_lshl_b32 s11, s6, 16
-; SI-NEXT: s_and_b32 s15, s5, 0xffff0000
-; SI-NEXT: s_lshl_b32 s14, s5, 16
-; SI-NEXT: s_and_b32 s42, s4, 0xffff0000
-; SI-NEXT: s_lshl_b32 s40, s4, 16
+; SI-NEXT: s_and_b32 s13, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s11, s7, 16
+; SI-NEXT: s_and_b32 s15, s6, 0xffff0000
+; SI-NEXT: s_lshl_b32 s14, s6, 16
+; SI-NEXT: s_and_b32 s42, s5, 0xffff0000
+; SI-NEXT: s_lshl_b32 s40, s5, 16
+; SI-NEXT: s_and_b32 s45, s4, 0xffff0000
+; SI-NEXT: s_lshl_b32 s44, s4, 16
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v0
; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v1
@@ -96369,14 +96320,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43
; SI-NEXT: s_branch .LBB111_5
; SI-NEXT: .LBB111_3:
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr9
; SI-NEXT: ; implicit-def: $sgpr11
; SI-NEXT: ; implicit-def: $sgpr13
; SI-NEXT: ; implicit-def: $sgpr14
; SI-NEXT: ; implicit-def: $sgpr15
; SI-NEXT: ; implicit-def: $sgpr40
; SI-NEXT: ; implicit-def: $sgpr42
+; SI-NEXT: ; implicit-def: $sgpr44
+; SI-NEXT: ; implicit-def: $sgpr45
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $sgpr60
@@ -96416,14 +96367,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v0, s8
-; SI-NEXT: v_mov_b32_e32 v1, s9
-; SI-NEXT: v_mov_b32_e32 v2, s11
-; SI-NEXT: v_mov_b32_e32 v3, s13
-; SI-NEXT: v_mov_b32_e32 v4, s14
-; SI-NEXT: v_mov_b32_e32 v5, s15
-; SI-NEXT: v_mov_b32_e32 v6, s40
-; SI-NEXT: v_mov_b32_e32 v7, s42
+; SI-NEXT: v_mov_b32_e32 v0, s11
+; SI-NEXT: v_mov_b32_e32 v1, s13
+; SI-NEXT: v_mov_b32_e32 v2, s14
+; SI-NEXT: v_mov_b32_e32 v3, s15
+; SI-NEXT: v_mov_b32_e32 v4, s40
+; SI-NEXT: v_mov_b32_e32 v5, s42
+; SI-NEXT: v_mov_b32_e32 v6, s44
+; SI-NEXT: v_mov_b32_e32 v7, s45
; SI-NEXT: v_mov_b32_e32 v8, v37
; SI-NEXT: v_mov_b32_e32 v11, v38
; SI-NEXT: v_mov_b32_e32 v12, v48
@@ -96515,11 +96466,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; VI-NEXT: s_waitcnt vmcnt(11)
; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
; VI-NEXT: s_waitcnt vmcnt(9)
-; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39
+; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v39
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49
+; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v49
; VI-NEXT: s_cbranch_scc0 .LBB111_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -96578,10 +96529,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, s4, v0
@@ -96641,9 +96592,9 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24
; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45
; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3
; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55
; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -96700,8 +96651,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; VI-NEXT: s_addk_i32 s6, 0x300
; VI-NEXT: s_addk_i32 s8, 0x300
; VI-NEXT: s_addk_i32 s10, 0x300
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57
; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45
; VI-NEXT: s_addk_i32 s4, 0x300
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_lshl_b32 s7, s7, 16
@@ -96709,8 +96660,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; VI-NEXT: s_and_b32 s10, s10, 0xffff
; VI-NEXT: s_and_b32 s8, s8, 0xffff
; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: s_or_b32 s9, s9, s10
; VI-NEXT: s_or_b32 s7, s7, s8
; VI-NEXT: s_or_b32 s5, s5, s6
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 3a26a5c263d78..70a4fdce8d9a0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -3264,7 +3264,7 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, 0
; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -3283,10 +3283,10 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s43
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3
@@ -3322,9 +3322,9 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
; GFX11-TRUE16-NEXT: .LBB13_3: ; %end
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14
@@ -3338,16 +3338,18 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, s46
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41
@@ -3365,9 +3367,9 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16
@@ -6339,7 +6341,7 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, 0
; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -6358,10 +6360,10 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s43
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3
; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3
@@ -6397,9 +6399,9 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
; GFX11-TRUE16-NEXT: .LBB17_3: ; %end
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14
@@ -6413,16 +6415,18 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, s46
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41
@@ -6440,9 +6444,9 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB17_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16
@@ -17770,7 +17774,7 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, 0
; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -17789,10 +17793,10 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s43
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3
; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3
@@ -17828,9 +17832,9 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
; GFX11-TRUE16-NEXT: .LBB41_3: ; %end
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14
@@ -17844,16 +17848,18 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, s46
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41
@@ -17871,9 +17877,9 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB41_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16
@@ -20855,7 +20861,7 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, 0
; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -20874,10 +20880,10 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s43
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3
; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3
@@ -20913,9 +20919,9 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
; GFX11-TRUE16-NEXT: .LBB45_3: ; %end
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14
@@ -20929,16 +20935,18 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, s46
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
+; GFX11-TRUE16-NEXT: s_mov_b32 s43, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41
@@ -20956,9 +20964,9 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB45_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index cc55ba1d84df6..9d2b0df4280e7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -3400,7 +3400,7 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, 0
; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -3421,10 +3421,10 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58
+; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s47
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -3464,24 +3464,26 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s0, 16
; GFX11-TRUE16-NEXT: .LBB13_3: ; %end
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, s58
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s47
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, s56
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45
@@ -3506,9 +3508,9 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
@@ -6774,7 +6776,7 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, 0
; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -6795,10 +6797,10 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58
+; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s47
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3
; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -6838,24 +6840,26 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s0, 16
; GFX11-TRUE16-NEXT: .LBB17_3: ; %end
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, s58
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s47
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, s56
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45
@@ -6880,9 +6884,9 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB17_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
@@ -8025,7 +8029,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v62, v2
; SI-NEXT: v_cvt_f16_f32_e32 v44, v5
; SI-NEXT: v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v7
; SI-NEXT: v_cvt_f16_f32_e32 v41, v6
; SI-NEXT: v_cvt_f16_f32_e32 v40, v9
; SI-NEXT: v_cvt_f16_f32_e32 v55, v8
@@ -8045,26 +8049,26 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v20, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v58, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v60, s16
-; SI-NEXT: v_cvt_f16_f32_e32 v59, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v57, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v61, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v57, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v60, s19
+; SI-NEXT: v_cvt_f16_f32_e32 v58, s18
; SI-NEXT: v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v56, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v42, s20
; SI-NEXT: v_cvt_f16_f32_e32 v38, s23
; SI-NEXT: v_cvt_f16_f32_e32 v37, s22
; SI-NEXT: v_cvt_f16_f32_e32 v36, s25
; SI-NEXT: v_cvt_f16_f32_e32 v35, s24
; SI-NEXT: v_cvt_f16_f32_e32 v34, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v61, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v33, s26
; SI-NEXT: v_cvt_f16_f32_e32 v25, s29
; SI-NEXT: v_cvt_f16_f32_e32 v24, s28
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_cbranch_scc0 .LBB19_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36
@@ -8073,7 +8077,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52
@@ -8083,12 +8087,12 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; SI-NEXT: v_or_b32_e32 v0, v60, v0
-; SI-NEXT: v_or_b32_e32 v1, v57, v1
-; SI-NEXT: v_or_b32_e32 v2, v56, v2
+; SI-NEXT: v_or_b32_e32 v0, v57, v0
+; SI-NEXT: v_or_b32_e32 v1, v58, v1
+; SI-NEXT: v_or_b32_e32 v2, v42, v2
; SI-NEXT: v_or_b32_e32 v3, v37, v3
; SI-NEXT: v_or_b32_e32 v4, v35, v4
-; SI-NEXT: v_or_b32_e32 v5, v61, v5
+; SI-NEXT: v_or_b32_e32 v5, v33, v5
; SI-NEXT: v_or_b32_e32 v6, v24, v6
; SI-NEXT: v_or_b32_e32 v7, v47, v7
; SI-NEXT: v_or_b32_e32 v8, v62, v8
@@ -8105,11 +8109,12 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v19, v21, v19
; SI-NEXT: s_cbranch_execnz .LBB19_3
; SI-NEXT: .LBB19_2: ; %cmp.true
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v58
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v61
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v58
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
@@ -8123,7 +8128,8 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v56
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v42
; SI-NEXT: v_cvt_f32_f16_e32 v4, v37
; SI-NEXT: v_cvt_f32_f16_e32 v5, v35
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -8137,7 +8143,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v3, v38
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v61
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v33
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v7, v24
@@ -8164,7 +8170,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v56
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v25
@@ -8288,20 +8294,20 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB19_4:
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v59, v48
+; SI-NEXT: v_mov_b32_e32 v42, v48
; SI-NEXT: v_mov_b32_e32 v48, v21
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v49
+; SI-NEXT: v_mov_b32_e32 v58, v49
; SI-NEXT: v_mov_b32_e32 v49, v20
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v43, v50
; SI-NEXT: v_mov_b32_e32 v50, v22
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v44, v51
; SI-NEXT: v_mov_b32_e32 v51, v23
@@ -8309,22 +8315,21 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v52, v27
; SI-NEXT: v_mov_b32_e32 v46, v53
; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v60, v33
; SI-NEXT: v_mov_b32_e32 v41, v32
; SI-NEXT: v_mov_b32_e32 v33, v47
; SI-NEXT: v_mov_b32_e32 v47, v54
; SI-NEXT: v_mov_b32_e32 v54, v29
-; SI-NEXT: v_mov_b32_e32 v42, v56
; SI-NEXT: v_mov_b32_e32 v56, v55
; SI-NEXT: v_mov_b32_e32 v55, v30
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v36, v57
+; SI-NEXT: v_mov_b32_e32 v59, v57
; SI-NEXT: v_mov_b32_e32 v57, v40
; SI-NEXT: v_mov_b32_e32 v40, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v39, v58
-; SI-NEXT: v_mov_b32_e32 v58, v37
+; SI-NEXT: v_mov_b32_e32 v36, v39
+; SI-NEXT: v_mov_b32_e32 v39, v37
; SI-NEXT: v_mov_b32_e32 v37, v34
; SI-NEXT: v_mov_b32_e32 v34, v24
; SI-NEXT: v_mov_b32_e32 v32, v38
@@ -8336,34 +8341,34 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v38, v32
; SI-NEXT: v_mov_b32_e32 v24, v34
; SI-NEXT: v_mov_b32_e32 v34, v37
-; SI-NEXT: v_mov_b32_e32 v37, v58
-; SI-NEXT: v_mov_b32_e32 v58, v39
-; SI-NEXT: v_mov_b32_e32 v31, v40
-; SI-NEXT: v_mov_b32_e32 v40, v57
-; SI-NEXT: v_mov_b32_e32 v57, v36
+; SI-NEXT: v_mov_b32_e32 v37, v39
+; SI-NEXT: v_mov_b32_e32 v39, v36
; SI-NEXT: v_mov_b32_e32 v30, v55
; SI-NEXT: v_mov_b32_e32 v55, v56
-; SI-NEXT: v_mov_b32_e32 v56, v42
+; SI-NEXT: v_mov_b32_e32 v29, v54
+; SI-NEXT: v_mov_b32_e32 v54, v47
+; SI-NEXT: v_mov_b32_e32 v47, v33
; SI-NEXT: v_mov_b32_e32 v32, v41
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v23, v51
; SI-NEXT: v_mov_b32_e32 v51, v44
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v22, v50
; SI-NEXT: v_mov_b32_e32 v50, v43
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v20, v49
-; SI-NEXT: v_mov_b32_e32 v49, v60
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v49, v58
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v21, v48
-; SI-NEXT: v_mov_b32_e32 v48, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v48, v42
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v29, v54
-; SI-NEXT: v_mov_b32_e32 v54, v47
-; SI-NEXT: v_mov_b32_e32 v47, v33
+; SI-NEXT: v_mov_b32_e32 v31, v40
+; SI-NEXT: v_mov_b32_e32 v40, v57
+; SI-NEXT: v_mov_b32_e32 v57, v59
; SI-NEXT: v_mov_b32_e32 v28, v53
; SI-NEXT: v_mov_b32_e32 v53, v46
; SI-NEXT: v_mov_b32_e32 v27, v52
@@ -16112,7 +16117,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v62, v2
; SI-NEXT: v_cvt_f16_f32_e32 v44, v5
; SI-NEXT: v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v7
; SI-NEXT: v_cvt_f16_f32_e32 v41, v6
; SI-NEXT: v_cvt_f16_f32_e32 v40, v9
; SI-NEXT: v_cvt_f16_f32_e32 v55, v8
@@ -16132,26 +16137,26 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v20, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v58, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v60, s16
-; SI-NEXT: v_cvt_f16_f32_e32 v59, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v57, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v61, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v57, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v60, s19
+; SI-NEXT: v_cvt_f16_f32_e32 v58, s18
; SI-NEXT: v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v56, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v42, s20
; SI-NEXT: v_cvt_f16_f32_e32 v38, s23
; SI-NEXT: v_cvt_f16_f32_e32 v37, s22
; SI-NEXT: v_cvt_f16_f32_e32 v36, s25
; SI-NEXT: v_cvt_f16_f32_e32 v35, s24
; SI-NEXT: v_cvt_f16_f32_e32 v34, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v61, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v33, s26
; SI-NEXT: v_cvt_f16_f32_e32 v25, s29
; SI-NEXT: v_cvt_f16_f32_e32 v24, s28
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_cbranch_scc0 .LBB35_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36
@@ -16160,7 +16165,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52
@@ -16170,12 +16175,12 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; SI-NEXT: v_or_b32_e32 v0, v60, v0
-; SI-NEXT: v_or_b32_e32 v1, v57, v1
-; SI-NEXT: v_or_b32_e32 v2, v56, v2
+; SI-NEXT: v_or_b32_e32 v0, v57, v0
+; SI-NEXT: v_or_b32_e32 v1, v58, v1
+; SI-NEXT: v_or_b32_e32 v2, v42, v2
; SI-NEXT: v_or_b32_e32 v3, v37, v3
; SI-NEXT: v_or_b32_e32 v4, v35, v4
-; SI-NEXT: v_or_b32_e32 v5, v61, v5
+; SI-NEXT: v_or_b32_e32 v5, v33, v5
; SI-NEXT: v_or_b32_e32 v6, v24, v6
; SI-NEXT: v_or_b32_e32 v7, v47, v7
; SI-NEXT: v_or_b32_e32 v8, v62, v8
@@ -16192,11 +16197,12 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v19, v21, v19
; SI-NEXT: s_cbranch_execnz .LBB35_3
; SI-NEXT: .LBB35_2: ; %cmp.true
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v58
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v61
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v58
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
@@ -16210,7 +16216,8 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v56
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v42
; SI-NEXT: v_cvt_f32_f16_e32 v4, v37
; SI-NEXT: v_cvt_f32_f16_e32 v5, v35
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -16224,7 +16231,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v3, v38
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v61
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v33
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v7, v24
@@ -16251,7 +16258,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v56
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v25
@@ -16375,20 +16382,20 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB35_4:
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v59, v48
+; SI-NEXT: v_mov_b32_e32 v42, v48
; SI-NEXT: v_mov_b32_e32 v48, v21
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v49
+; SI-NEXT: v_mov_b32_e32 v58, v49
; SI-NEXT: v_mov_b32_e32 v49, v20
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v43, v50
; SI-NEXT: v_mov_b32_e32 v50, v22
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v44, v51
; SI-NEXT: v_mov_b32_e32 v51, v23
@@ -16396,22 +16403,21 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_mov_b32_e32 v52, v27
; SI-NEXT: v_mov_b32_e32 v46, v53
; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v60, v33
; SI-NEXT: v_mov_b32_e32 v41, v32
; SI-NEXT: v_mov_b32_e32 v33, v47
; SI-NEXT: v_mov_b32_e32 v47, v54
; SI-NEXT: v_mov_b32_e32 v54, v29
-; SI-NEXT: v_mov_b32_e32 v42, v56
; SI-NEXT: v_mov_b32_e32 v56, v55
; SI-NEXT: v_mov_b32_e32 v55, v30
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v36, v57
+; SI-NEXT: v_mov_b32_e32 v59, v57
; SI-NEXT: v_mov_b32_e32 v57, v40
; SI-NEXT: v_mov_b32_e32 v40, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v39, v58
-; SI-NEXT: v_mov_b32_e32 v58, v37
+; SI-NEXT: v_mov_b32_e32 v36, v39
+; SI-NEXT: v_mov_b32_e32 v39, v37
; SI-NEXT: v_mov_b32_e32 v37, v34
; SI-NEXT: v_mov_b32_e32 v34, v24
; SI-NEXT: v_mov_b32_e32 v32, v38
@@ -16423,34 +16429,34 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; SI-NEXT: v_mov_b32_e32 v38, v32
; SI-NEXT: v_mov_b32_e32 v24, v34
; SI-NEXT: v_mov_b32_e32 v34, v37
-; SI-NEXT: v_mov_b32_e32 v37, v58
-; SI-NEXT: v_mov_b32_e32 v58, v39
-; SI-NEXT: v_mov_b32_e32 v31, v40
-; SI-NEXT: v_mov_b32_e32 v40, v57
-; SI-NEXT: v_mov_b32_e32 v57, v36
+; SI-NEXT: v_mov_b32_e32 v37, v39
+; SI-NEXT: v_mov_b32_e32 v39, v36
; SI-NEXT: v_mov_b32_e32 v30, v55
; SI-NEXT: v_mov_b32_e32 v55, v56
-; SI-NEXT: v_mov_b32_e32 v56, v42
+; SI-NEXT: v_mov_b32_e32 v29, v54
+; SI-NEXT: v_mov_b32_e32 v54, v47
+; SI-NEXT: v_mov_b32_e32 v47, v33
; SI-NEXT: v_mov_b32_e32 v32, v41
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v23, v51
; SI-NEXT: v_mov_b32_e32 v51, v44
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v22, v50
; SI-NEXT: v_mov_b32_e32 v50, v43
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v20, v49
-; SI-NEXT: v_mov_b32_e32 v49, v60
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v49, v58
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v21, v48
-; SI-NEXT: v_mov_b32_e32 v48, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v48, v42
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v29, v54
-; SI-NEXT: v_mov_b32_e32 v54, v47
-; SI-NEXT: v_mov_b32_e32 v47, v33
+; SI-NEXT: v_mov_b32_e32 v31, v40
+; SI-NEXT: v_mov_b32_e32 v40, v57
+; SI-NEXT: v_mov_b32_e32 v57, v59
; SI-NEXT: v_mov_b32_e32 v28, v53
; SI-NEXT: v_mov_b32_e32 v53, v46
; SI-NEXT: v_mov_b32_e32 v27, v52
@@ -18910,7 +18916,7 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, 0
; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -18931,10 +18937,10 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58
+; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s47
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3
; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s5, s5, 3
@@ -18974,24 +18980,26 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s0, 16
; GFX11-TRUE16-NEXT: .LBB41_3: ; %end
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, s58
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s47
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, s56
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45
@@ -19016,9 +19024,9 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB41_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
@@ -22294,7 +22302,7 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, 0
; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -22315,10 +22323,10 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58
+; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s47
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3
; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s5, s5, 3
@@ -22358,24 +22366,26 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s0, 16
; GFX11-TRUE16-NEXT: .LBB45_3: ; %end
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, s58
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s47
+; GFX11-TRUE16-NEXT: s_mov_b32 s47, s56
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45
@@ -22400,9 +22410,9 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB45_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16
@@ -23545,7 +23555,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v62, v2
; SI-NEXT: v_cvt_f16_f32_e32 v44, v5
; SI-NEXT: v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v7
; SI-NEXT: v_cvt_f16_f32_e32 v41, v6
; SI-NEXT: v_cvt_f16_f32_e32 v40, v9
; SI-NEXT: v_cvt_f16_f32_e32 v55, v8
@@ -23565,26 +23575,26 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v20, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v58, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v60, s16
-; SI-NEXT: v_cvt_f16_f32_e32 v59, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v57, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v61, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v57, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v60, s19
+; SI-NEXT: v_cvt_f16_f32_e32 v58, s18
; SI-NEXT: v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v56, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v42, s20
; SI-NEXT: v_cvt_f16_f32_e32 v38, s23
; SI-NEXT: v_cvt_f16_f32_e32 v37, s22
; SI-NEXT: v_cvt_f16_f32_e32 v36, s25
; SI-NEXT: v_cvt_f16_f32_e32 v35, s24
; SI-NEXT: v_cvt_f16_f32_e32 v34, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v61, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v33, s26
; SI-NEXT: v_cvt_f16_f32_e32 v25, s29
; SI-NEXT: v_cvt_f16_f32_e32 v24, s28
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36
@@ -23593,7 +23603,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52
@@ -23603,12 +23613,12 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; SI-NEXT: v_or_b32_e32 v0, v60, v0
-; SI-NEXT: v_or_b32_e32 v1, v57, v1
-; SI-NEXT: v_or_b32_e32 v2, v56, v2
+; SI-NEXT: v_or_b32_e32 v0, v57, v0
+; SI-NEXT: v_or_b32_e32 v1, v58, v1
+; SI-NEXT: v_or_b32_e32 v2, v42, v2
; SI-NEXT: v_or_b32_e32 v3, v37, v3
; SI-NEXT: v_or_b32_e32 v4, v35, v4
-; SI-NEXT: v_or_b32_e32 v5, v61, v5
+; SI-NEXT: v_or_b32_e32 v5, v33, v5
; SI-NEXT: v_or_b32_e32 v6, v24, v6
; SI-NEXT: v_or_b32_e32 v7, v47, v7
; SI-NEXT: v_or_b32_e32 v8, v62, v8
@@ -23625,11 +23635,12 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v19, v21, v19
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v58
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v61
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v58
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
@@ -23643,7 +23654,8 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v56
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v42
; SI-NEXT: v_cvt_f32_f16_e32 v4, v37
; SI-NEXT: v_cvt_f32_f16_e32 v5, v35
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -23657,7 +23669,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v3, v38
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v61
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v33
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v7, v24
@@ -23684,7 +23696,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v56
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v25
@@ -23808,20 +23820,20 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v59, v48
+; SI-NEXT: v_mov_b32_e32 v42, v48
; SI-NEXT: v_mov_b32_e32 v48, v21
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v49
+; SI-NEXT: v_mov_b32_e32 v58, v49
; SI-NEXT: v_mov_b32_e32 v49, v20
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v43, v50
; SI-NEXT: v_mov_b32_e32 v50, v22
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v44, v51
; SI-NEXT: v_mov_b32_e32 v51, v23
@@ -23829,22 +23841,21 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v52, v27
; SI-NEXT: v_mov_b32_e32 v46, v53
; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v60, v33
; SI-NEXT: v_mov_b32_e32 v41, v32
; SI-NEXT: v_mov_b32_e32 v33, v47
; SI-NEXT: v_mov_b32_e32 v47, v54
; SI-NEXT: v_mov_b32_e32 v54, v29
-; SI-NEXT: v_mov_b32_e32 v42, v56
; SI-NEXT: v_mov_b32_e32 v56, v55
; SI-NEXT: v_mov_b32_e32 v55, v30
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v36, v57
+; SI-NEXT: v_mov_b32_e32 v59, v57
; SI-NEXT: v_mov_b32_e32 v57, v40
; SI-NEXT: v_mov_b32_e32 v40, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v39, v58
-; SI-NEXT: v_mov_b32_e32 v58, v37
+; SI-NEXT: v_mov_b32_e32 v36, v39
+; SI-NEXT: v_mov_b32_e32 v39, v37
; SI-NEXT: v_mov_b32_e32 v37, v34
; SI-NEXT: v_mov_b32_e32 v34, v24
; SI-NEXT: v_mov_b32_e32 v32, v38
@@ -23856,34 +23867,34 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v38, v32
; SI-NEXT: v_mov_b32_e32 v24, v34
; SI-NEXT: v_mov_b32_e32 v34, v37
-; SI-NEXT: v_mov_b32_e32 v37, v58
-; SI-NEXT: v_mov_b32_e32 v58, v39
-; SI-NEXT: v_mov_b32_e32 v31, v40
-; SI-NEXT: v_mov_b32_e32 v40, v57
-; SI-NEXT: v_mov_b32_e32 v57, v36
+; SI-NEXT: v_mov_b32_e32 v37, v39
+; SI-NEXT: v_mov_b32_e32 v39, v36
; SI-NEXT: v_mov_b32_e32 v30, v55
; SI-NEXT: v_mov_b32_e32 v55, v56
-; SI-NEXT: v_mov_b32_e32 v56, v42
+; SI-NEXT: v_mov_b32_e32 v29, v54
+; SI-NEXT: v_mov_b32_e32 v54, v47
+; SI-NEXT: v_mov_b32_e32 v47, v33
; SI-NEXT: v_mov_b32_e32 v32, v41
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v23, v51
; SI-NEXT: v_mov_b32_e32 v51, v44
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v22, v50
; SI-NEXT: v_mov_b32_e32 v50, v43
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v20, v49
-; SI-NEXT: v_mov_b32_e32 v49, v60
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v49, v58
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v21, v48
-; SI-NEXT: v_mov_b32_e32 v48, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v48, v42
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v29, v54
-; SI-NEXT: v_mov_b32_e32 v54, v47
-; SI-NEXT: v_mov_b32_e32 v47, v33
+; SI-NEXT: v_mov_b32_e32 v31, v40
+; SI-NEXT: v_mov_b32_e32 v40, v57
+; SI-NEXT: v_mov_b32_e32 v57, v59
; SI-NEXT: v_mov_b32_e32 v28, v53
; SI-NEXT: v_mov_b32_e32 v53, v46
; SI-NEXT: v_mov_b32_e32 v27, v52
@@ -30093,7 +30104,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v62, v2
; SI-NEXT: v_cvt_f16_f32_e32 v44, v5
; SI-NEXT: v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v7
; SI-NEXT: v_cvt_f16_f32_e32 v41, v6
; SI-NEXT: v_cvt_f16_f32_e32 v40, v9
; SI-NEXT: v_cvt_f16_f32_e32 v55, v8
@@ -30113,26 +30124,26 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v20, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v58, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v60, s16
-; SI-NEXT: v_cvt_f16_f32_e32 v59, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v57, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v61, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v57, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v60, s19
+; SI-NEXT: v_cvt_f16_f32_e32 v58, s18
; SI-NEXT: v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v56, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v42, s20
; SI-NEXT: v_cvt_f16_f32_e32 v38, s23
; SI-NEXT: v_cvt_f16_f32_e32 v37, s22
; SI-NEXT: v_cvt_f16_f32_e32 v36, s25
; SI-NEXT: v_cvt_f16_f32_e32 v35, s24
; SI-NEXT: v_cvt_f16_f32_e32 v34, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v61, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v33, s26
; SI-NEXT: v_cvt_f16_f32_e32 v25, s29
; SI-NEXT: v_cvt_f16_f32_e32 v24, s28
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_cbranch_scc0 .LBB55_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36
@@ -30141,7 +30152,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52
@@ -30151,12 +30162,12 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; SI-NEXT: v_or_b32_e32 v0, v60, v0
-; SI-NEXT: v_or_b32_e32 v1, v57, v1
-; SI-NEXT: v_or_b32_e32 v2, v56, v2
+; SI-NEXT: v_or_b32_e32 v0, v57, v0
+; SI-NEXT: v_or_b32_e32 v1, v58, v1
+; SI-NEXT: v_or_b32_e32 v2, v42, v2
; SI-NEXT: v_or_b32_e32 v3, v37, v3
; SI-NEXT: v_or_b32_e32 v4, v35, v4
-; SI-NEXT: v_or_b32_e32 v5, v61, v5
+; SI-NEXT: v_or_b32_e32 v5, v33, v5
; SI-NEXT: v_or_b32_e32 v6, v24, v6
; SI-NEXT: v_or_b32_e32 v7, v47, v7
; SI-NEXT: v_or_b32_e32 v8, v62, v8
@@ -30173,11 +30184,12 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v19, v21, v19
; SI-NEXT: s_cbranch_execnz .LBB55_3
; SI-NEXT: .LBB55_2: ; %cmp.true
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v58
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v61
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v57
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v58
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
@@ -30191,7 +30203,8 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v56
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v42
; SI-NEXT: v_cvt_f32_f16_e32 v4, v37
; SI-NEXT: v_cvt_f32_f16_e32 v5, v35
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -30205,7 +30218,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v3, v38
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v61
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v33
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v7, v24
@@ -30232,7 +30245,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v56
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v25
@@ -30356,20 +30369,20 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB55_4:
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v59, v48
+; SI-NEXT: v_mov_b32_e32 v42, v48
; SI-NEXT: v_mov_b32_e32 v48, v21
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v49
+; SI-NEXT: v_mov_b32_e32 v58, v49
; SI-NEXT: v_mov_b32_e32 v49, v20
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v43, v50
; SI-NEXT: v_mov_b32_e32 v50, v22
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v44, v51
; SI-NEXT: v_mov_b32_e32 v51, v23
@@ -30377,22 +30390,21 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_mov_b32_e32 v52, v27
; SI-NEXT: v_mov_b32_e32 v46, v53
; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v60, v33
; SI-NEXT: v_mov_b32_e32 v41, v32
; SI-NEXT: v_mov_b32_e32 v33, v47
; SI-NEXT: v_mov_b32_e32 v47, v54
; SI-NEXT: v_mov_b32_e32 v54, v29
-; SI-NEXT: v_mov_b32_e32 v42, v56
; SI-NEXT: v_mov_b32_e32 v56, v55
; SI-NEXT: v_mov_b32_e32 v55, v30
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v36, v57
+; SI-NEXT: v_mov_b32_e32 v59, v57
; SI-NEXT: v_mov_b32_e32 v57, v40
; SI-NEXT: v_mov_b32_e32 v40, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v39, v58
-; SI-NEXT: v_mov_b32_e32 v58, v37
+; SI-NEXT: v_mov_b32_e32 v36, v39
+; SI-NEXT: v_mov_b32_e32 v39, v37
; SI-NEXT: v_mov_b32_e32 v37, v34
; SI-NEXT: v_mov_b32_e32 v34, v24
; SI-NEXT: v_mov_b32_e32 v32, v38
@@ -30404,34 +30416,34 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; SI-NEXT: v_mov_b32_e32 v38, v32
; SI-NEXT: v_mov_b32_e32 v24, v34
; SI-NEXT: v_mov_b32_e32 v34, v37
-; SI-NEXT: v_mov_b32_e32 v37, v58
-; SI-NEXT: v_mov_b32_e32 v58, v39
-; SI-NEXT: v_mov_b32_e32 v31, v40
-; SI-NEXT: v_mov_b32_e32 v40, v57
-; SI-NEXT: v_mov_b32_e32 v57, v36
+; SI-NEXT: v_mov_b32_e32 v37, v39
+; SI-NEXT: v_mov_b32_e32 v39, v36
; SI-NEXT: v_mov_b32_e32 v30, v55
; SI-NEXT: v_mov_b32_e32 v55, v56
-; SI-NEXT: v_mov_b32_e32 v56, v42
+; SI-NEXT: v_mov_b32_e32 v29, v54
+; SI-NEXT: v_mov_b32_e32 v54, v47
+; SI-NEXT: v_mov_b32_e32 v47, v33
; SI-NEXT: v_mov_b32_e32 v32, v41
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v23, v51
; SI-NEXT: v_mov_b32_e32 v51, v44
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v22, v50
; SI-NEXT: v_mov_b32_e32 v50, v43
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v20, v49
-; SI-NEXT: v_mov_b32_e32 v49, v60
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v49, v58
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v21, v48
-; SI-NEXT: v_mov_b32_e32 v48, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v48, v42
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v29, v54
-; SI-NEXT: v_mov_b32_e32 v54, v47
-; SI-NEXT: v_mov_b32_e32 v47, v33
+; SI-NEXT: v_mov_b32_e32 v31, v40
+; SI-NEXT: v_mov_b32_e32 v40, v57
+; SI-NEXT: v_mov_b32_e32 v57, v59
; SI-NEXT: v_mov_b32_e32 v28, v53
; SI-NEXT: v_mov_b32_e32 v53, v46
; SI-NEXT: v_mov_b32_e32 v27, v52
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index b73870977c429..50b13db0e15d4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -2889,7 +2889,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
@@ -2897,8 +2897,8 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3
; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3
@@ -2908,10 +2908,10 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8
; GFX11-TRUE16-NEXT: .LBB25_3: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
@@ -2920,7 +2920,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB25_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16
@@ -2932,7 +2932,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
@@ -2940,8 +2940,8 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3
; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3
@@ -2951,16 +2951,16 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8
; GFX11-FAKE16-NEXT: .LBB25_3: ; %end
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s8
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB25_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
@@ -9107,7 +9107,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
@@ -9115,8 +9115,8 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3
; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3
@@ -9126,10 +9126,10 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8
; GFX11-TRUE16-NEXT: .LBB69_3: ; %end
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
@@ -9138,7 +9138,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB69_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16
@@ -9150,7 +9150,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
@@ -9158,8 +9158,8 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3
; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true
; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3
@@ -9169,16 +9169,16 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8
; GFX11-FAKE16-NEXT: .LBB69_3: ; %end
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s8
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB69_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2
; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
@@ -13747,47 +13747,45 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) {
; VI-LABEL: bitcast_v4i16_to_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: ; implicit-def: $vgpr4
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: ; implicit-def: $vgpr3
-; VI-NEXT: ; implicit-def: $vgpr8
; VI-NEXT: ; implicit-def: $vgpr5
; VI-NEXT: ; implicit-def: $vgpr7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; VI-NEXT: v_mov_b32_e32 v9, v0
-; VI-NEXT: v_mov_b32_e32 v8, v1
-; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8
; VI-NEXT: ; %bb.2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB96_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v2, 3
-; VI-NEXT: v_add_u16_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v8, 3, v1
+; VI-NEXT: v_mov_b32_e32 v0, 3
+; VI-NEXT: v_add_u16_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v10, 3, v9
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; VI-NEXT: v_add_u16_e32 v9, 3, v0
+; VI-NEXT: v_add_u16_e32 v9, 3, v8
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; VI-NEXT: v_or_b32_e32 v1, v8, v1
+; VI-NEXT: v_or_b32_e32 v1, v10, v1
; VI-NEXT: v_or_b32_e32 v0, v9, v0
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_bfe_u32 v7, v6, 8, 8
+; VI-NEXT: v_mov_b32_e32 v8, v9
+; VI-NEXT: v_mov_b32_e32 v9, v10
; VI-NEXT: .LBB96_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, v9
-; VI-NEXT: v_mov_b32_e32 v1, v4
-; VI-NEXT: v_mov_b32_e32 v4, v8
+; VI-NEXT: v_mov_b32_e32 v0, v8
+; VI-NEXT: v_mov_b32_e32 v4, v9
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v4i16_to_v8i8:
@@ -13984,48 +13982,47 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre
; VI-NEXT: s_cbranch_scc0 .LBB97_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
-; VI-NEXT: s_lshr_b32 s10, s17, 24
+; VI-NEXT: s_lshr_b32 s9, s17, 24
; VI-NEXT: s_lshr_b32 s8, s17, 16
; VI-NEXT: s_lshr_b32 s5, s17, 8
-; VI-NEXT: s_lshr_b32 s11, s16, 16
-; VI-NEXT: s_lshr_b32 s12, s16, 8
-; VI-NEXT: s_mov_b32 s9, s17
+; VI-NEXT: s_lshr_b32 s10, s16, 16
+; VI-NEXT: s_lshr_b32 s11, s16, 8
; VI-NEXT: s_cbranch_execnz .LBB97_3
; VI-NEXT: .LBB97_2: ; %cmp.true
-; VI-NEXT: s_lshr_b32 s5, s17, 16
-; VI-NEXT: s_add_i32 s9, s17, 3
-; VI-NEXT: s_add_i32 s8, s5, 3
-; VI-NEXT: s_and_b32 s4, s9, 0xffff
-; VI-NEXT: s_lshl_b32 s5, s8, 16
-; VI-NEXT: s_or_b32 s7, s4, s5
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshr_b32 s6, s17, 16
+; VI-NEXT: s_add_i32 s4, s17, 3
+; VI-NEXT: s_add_i32 s8, s6, 3
+; VI-NEXT: s_and_b32 s5, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s8, 16
+; VI-NEXT: s_or_b32 s7, s5, s6
+; VI-NEXT: s_and_b32 s5, s16, 0xffff0000
; VI-NEXT: s_add_i32 s16, s16, 3
-; VI-NEXT: s_and_b32 s5, s16, 0xffff
-; VI-NEXT: s_or_b32 s4, s4, s5
-; VI-NEXT: s_add_i32 s6, s4, 0x30000
+; VI-NEXT: s_and_b32 s6, s16, 0xffff
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_add_i32 s6, s5, 0x30000
+; VI-NEXT: s_mov_b32 s17, s4
; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24
; VI-NEXT: s_lshr_b32 s5, s7, 8
-; VI-NEXT: s_lshr_b32 s11, s6, 16
-; VI-NEXT: s_lshr_b32 s12, s6, 8
-; VI-NEXT: s_bfe_u32 s10, s8, 0x80008
+; VI-NEXT: s_lshr_b32 s10, s6, 16
+; VI-NEXT: s_lshr_b32 s11, s6, 8
+; VI-NEXT: s_bfe_u32 s9, s8, 0x80008
; VI-NEXT: .LBB97_3: ; %end
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s12
-; VI-NEXT: v_mov_b32_e32 v2, s11
+; VI-NEXT: v_mov_b32_e32 v1, s11
+; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s4
-; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v4, s17
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v6, s8
-; VI-NEXT: v_mov_b32_e32 v7, s10
+; VI-NEXT: v_mov_b32_e32 v7, s9
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB97_4:
-; VI-NEXT: ; implicit-def: $sgpr12
; VI-NEXT: ; implicit-def: $sgpr11
+; VI-NEXT: ; implicit-def: $sgpr10
; VI-NEXT: ; implicit-def: $sgpr4
-; VI-NEXT: ; implicit-def: $sgpr9
; VI-NEXT: ; implicit-def: $sgpr5
; VI-NEXT: ; implicit-def: $sgpr8
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr9
; VI-NEXT: s_branch .LBB97_2
;
; GFX9-LABEL: bitcast_v4i16_to_v8i8_scalar:
@@ -16967,77 +16964,78 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v9.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v0 :: v_dual_lshlrev_b32 v0, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v12, v4, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v11, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v3, v2
-; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v3, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[10:11]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 2a96722ccce0b..51ed72a3a16fe 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -3642,7 +3642,7 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0
; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -3664,11 +3664,11 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -3711,19 +3711,22 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
; GFX11-TRUE16-NEXT: .LBB13_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s62
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s58
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s60
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s61
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s59
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47
@@ -3753,10 +3756,10 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16
@@ -7383,7 +7386,7 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0
; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -7405,11 +7408,11 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3
; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -7452,19 +7455,22 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
; GFX11-TRUE16-NEXT: .LBB17_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s62
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s58
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s60
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s61
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s59
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47
@@ -7494,10 +7500,10 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB17_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16
@@ -8775,13 +8781,13 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v1
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v3
; SI-NEXT: v_cvt_f16_f32_e32 v38, v2
; SI-NEXT: v_cvt_f16_f32_e32 v37, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v4
; SI-NEXT: v_cvt_f16_f32_e32 v35, v7
; SI-NEXT: v_cvt_f16_f32_e32 v50, v6
; SI-NEXT: v_cvt_f16_f32_e32 v32, v9
@@ -8809,37 +8815,36 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v51, s17
; SI-NEXT: v_cvt_f16_f32_e32 v33, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v34, s18
-; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v3, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, s21
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v62, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v63, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v62, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v34, s22
; SI-NEXT: v_cvt_f16_f32_e32 v61, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v36, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v49, s24
; SI-NEXT: v_cvt_f16_f32_e32 v29, s27
; SI-NEXT: v_cvt_f16_f32_e32 v28, s26
; SI-NEXT: v_cvt_f16_f32_e32 v27, s29
; SI-NEXT: v_cvt_f16_f32_e32 v26, s28
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
@@ -8854,14 +8859,14 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22
; SI-NEXT: v_or_b32_e32 v0, v33, v0
-; SI-NEXT: v_or_b32_e32 v1, v34, v1
-; SI-NEXT: v_or_b32_e32 v3, v62, v3
-; SI-NEXT: v_or_b32_e32 v4, v36, v4
+; SI-NEXT: v_or_b32_e32 v2, v63, v2
+; SI-NEXT: v_or_b32_e32 v3, v34, v3
+; SI-NEXT: v_or_b32_e32 v4, v49, v4
; SI-NEXT: v_or_b32_e32 v5, v28, v5
; SI-NEXT: v_or_b32_e32 v6, v26, v6
-; SI-NEXT: v_or_b32_e32 v7, v48, v7
+; SI-NEXT: v_or_b32_e32 v7, v60, v7
; SI-NEXT: v_or_b32_e32 v8, v38, v8
-; SI-NEXT: v_or_b32_e32 v9, v49, v9
+; SI-NEXT: v_or_b32_e32 v9, v36, v9
; SI-NEXT: v_or_b32_e32 v10, v50, v10
; SI-NEXT: v_or_b32_e32 v11, v59, v11
; SI-NEXT: v_or_b32_e32 v12, v57, v12
@@ -8877,65 +8882,64 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: s_cbranch_execnz .LBB19_3
; SI-NEXT: .LBB19_2: ; %cmp.true
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v0, v51
; SI-NEXT: v_cvt_f32_f16_e32 v1, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v62
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v49
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v26
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v60
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
; SI-NEXT: v_cvt_f32_f16_e32 v10, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v36
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
; SI-NEXT: v_cvt_f32_f16_e32 v14, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v44
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v47
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v44
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
; SI-NEXT: v_cvt_f32_f16_e32 v18, v41
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v53
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v53
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v21, v31
@@ -8949,25 +8953,27 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v62
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -8987,12 +8993,12 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v39
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v48
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
@@ -9080,86 +9086,89 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB19_4:
; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v59, v46
; SI-NEXT: v_mov_b32_e32 v46, v41
; SI-NEXT: v_mov_b32_e32 v41, v52
; SI-NEXT: v_mov_b32_e32 v52, v23
-; SI-NEXT: v_mov_b32_e32 v48, v60
+; SI-NEXT: v_mov_b32_e32 v48, v39
+; SI-NEXT: v_mov_b32_e32 v39, v60
; SI-NEXT: v_mov_b32_e32 v60, v47
; SI-NEXT: v_mov_b32_e32 v47, v42
; SI-NEXT: v_mov_b32_e32 v42, v53
; SI-NEXT: v_mov_b32_e32 v53, v22
-; SI-NEXT: v_mov_b32_e32 v35, v61
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v34, v61
; SI-NEXT: v_mov_b32_e32 v61, v56
; SI-NEXT: v_mov_b32_e32 v56, v43
; SI-NEXT: v_mov_b32_e32 v43, v54
; SI-NEXT: v_mov_b32_e32 v54, v24
-; SI-NEXT: v_mov_b32_e32 v50, v34
-; SI-NEXT: v_mov_b32_e32 v34, v62
+; SI-NEXT: v_mov_b32_e32 v50, v62
; SI-NEXT: v_mov_b32_e32 v62, v57
; SI-NEXT: v_mov_b32_e32 v57, v44
; SI-NEXT: v_mov_b32_e32 v44, v55
; SI-NEXT: v_mov_b32_e32 v55, v25
-; SI-NEXT: v_mov_b32_e32 v32, v33
+; SI-NEXT: v_mov_b32_e32 v32, v51
+; SI-NEXT: v_mov_b32_e32 v51, v33
; SI-NEXT: v_mov_b32_e32 v33, v63
; SI-NEXT: v_mov_b32_e32 v63, v58
; SI-NEXT: v_mov_b32_e32 v58, v45
; SI-NEXT: v_mov_b32_e32 v45, v40
; SI-NEXT: v_mov_b32_e32 v40, v31
-; SI-NEXT: v_mov_b32_e32 v39, v26
-; SI-NEXT: v_mov_b32_e32 v38, v27
-; SI-NEXT: v_mov_b32_e32 v37, v28
-; SI-NEXT: v_mov_b32_e32 v49, v36
-; SI-NEXT: v_mov_b32_e32 v36, v29
+; SI-NEXT: v_mov_b32_e32 v38, v26
+; SI-NEXT: v_mov_b32_e32 v37, v27
+; SI-NEXT: v_mov_b32_e32 v36, v28
+; SI-NEXT: v_mov_b32_e32 v35, v49
+; SI-NEXT: v_mov_b32_e32 v49, v29
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT: v_mov_b32_e32 v29, v36
-; SI-NEXT: v_mov_b32_e32 v36, v49
-; SI-NEXT: v_mov_b32_e32 v28, v37
-; SI-NEXT: v_mov_b32_e32 v27, v38
-; SI-NEXT: v_mov_b32_e32 v26, v39
+; SI-NEXT: v_mov_b32_e32 v29, v49
+; SI-NEXT: v_mov_b32_e32 v49, v35
+; SI-NEXT: v_mov_b32_e32 v28, v36
+; SI-NEXT: v_mov_b32_e32 v27, v37
+; SI-NEXT: v_mov_b32_e32 v26, v38
; SI-NEXT: v_mov_b32_e32 v31, v40
; SI-NEXT: v_mov_b32_e32 v40, v45
; SI-NEXT: v_mov_b32_e32 v45, v58
; SI-NEXT: v_mov_b32_e32 v58, v63
; SI-NEXT: v_mov_b32_e32 v63, v33
-; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v33, v51
+; SI-NEXT: v_mov_b32_e32 v51, v32
; SI-NEXT: v_mov_b32_e32 v25, v55
; SI-NEXT: v_mov_b32_e32 v55, v44
; SI-NEXT: v_mov_b32_e32 v44, v57
; SI-NEXT: v_mov_b32_e32 v57, v62
-; SI-NEXT: v_mov_b32_e32 v62, v34
-; SI-NEXT: v_mov_b32_e32 v34, v50
+; SI-NEXT: v_mov_b32_e32 v62, v50
; SI-NEXT: v_mov_b32_e32 v24, v54
; SI-NEXT: v_mov_b32_e32 v54, v43
; SI-NEXT: v_mov_b32_e32 v43, v56
; SI-NEXT: v_mov_b32_e32 v56, v61
-; SI-NEXT: v_mov_b32_e32 v61, v35
+; SI-NEXT: v_mov_b32_e32 v61, v34
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v22, v53
; SI-NEXT: v_mov_b32_e32 v53, v42
; SI-NEXT: v_mov_b32_e32 v42, v47
; SI-NEXT: v_mov_b32_e32 v47, v60
-; SI-NEXT: v_mov_b32_e32 v60, v48
+; SI-NEXT: v_mov_b32_e32 v60, v39
+; SI-NEXT: v_mov_b32_e32 v39, v48
; SI-NEXT: v_mov_b32_e32 v23, v52
; SI-NEXT: v_mov_b32_e32 v52, v41
; SI-NEXT: v_mov_b32_e32 v41, v46
; SI-NEXT: v_mov_b32_e32 v46, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_branch .LBB19_2
;
@@ -17665,13 +17674,13 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v1
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v3
; SI-NEXT: v_cvt_f16_f32_e32 v38, v2
; SI-NEXT: v_cvt_f16_f32_e32 v37, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v4
; SI-NEXT: v_cvt_f16_f32_e32 v35, v7
; SI-NEXT: v_cvt_f16_f32_e32 v50, v6
; SI-NEXT: v_cvt_f16_f32_e32 v32, v9
@@ -17699,37 +17708,36 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v51, s17
; SI-NEXT: v_cvt_f16_f32_e32 v33, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v34, s18
-; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v3, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, s21
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v62, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v63, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v62, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v34, s22
; SI-NEXT: v_cvt_f16_f32_e32 v61, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v36, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v49, s24
; SI-NEXT: v_cvt_f16_f32_e32 v29, s27
; SI-NEXT: v_cvt_f16_f32_e32 v28, s26
; SI-NEXT: v_cvt_f16_f32_e32 v27, s29
; SI-NEXT: v_cvt_f16_f32_e32 v26, s28
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
@@ -17744,14 +17752,14 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22
; SI-NEXT: v_or_b32_e32 v0, v33, v0
-; SI-NEXT: v_or_b32_e32 v1, v34, v1
-; SI-NEXT: v_or_b32_e32 v3, v62, v3
-; SI-NEXT: v_or_b32_e32 v4, v36, v4
+; SI-NEXT: v_or_b32_e32 v2, v63, v2
+; SI-NEXT: v_or_b32_e32 v3, v34, v3
+; SI-NEXT: v_or_b32_e32 v4, v49, v4
; SI-NEXT: v_or_b32_e32 v5, v28, v5
; SI-NEXT: v_or_b32_e32 v6, v26, v6
-; SI-NEXT: v_or_b32_e32 v7, v48, v7
+; SI-NEXT: v_or_b32_e32 v7, v60, v7
; SI-NEXT: v_or_b32_e32 v8, v38, v8
-; SI-NEXT: v_or_b32_e32 v9, v49, v9
+; SI-NEXT: v_or_b32_e32 v9, v36, v9
; SI-NEXT: v_or_b32_e32 v10, v50, v10
; SI-NEXT: v_or_b32_e32 v11, v59, v11
; SI-NEXT: v_or_b32_e32 v12, v57, v12
@@ -17767,65 +17775,64 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: s_cbranch_execnz .LBB35_3
; SI-NEXT: .LBB35_2: ; %cmp.true
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v0, v51
; SI-NEXT: v_cvt_f32_f16_e32 v1, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v62
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v49
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v26
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v60
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
; SI-NEXT: v_cvt_f32_f16_e32 v10, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v36
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
; SI-NEXT: v_cvt_f32_f16_e32 v14, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v44
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v47
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v44
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
; SI-NEXT: v_cvt_f32_f16_e32 v18, v41
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v53
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v53
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v21, v31
@@ -17839,25 +17846,27 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v62
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -17877,12 +17886,12 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v39
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v48
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
@@ -17970,86 +17979,89 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB35_4:
; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v59, v46
; SI-NEXT: v_mov_b32_e32 v46, v41
; SI-NEXT: v_mov_b32_e32 v41, v52
; SI-NEXT: v_mov_b32_e32 v52, v23
-; SI-NEXT: v_mov_b32_e32 v48, v60
+; SI-NEXT: v_mov_b32_e32 v48, v39
+; SI-NEXT: v_mov_b32_e32 v39, v60
; SI-NEXT: v_mov_b32_e32 v60, v47
; SI-NEXT: v_mov_b32_e32 v47, v42
; SI-NEXT: v_mov_b32_e32 v42, v53
; SI-NEXT: v_mov_b32_e32 v53, v22
-; SI-NEXT: v_mov_b32_e32 v35, v61
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v34, v61
; SI-NEXT: v_mov_b32_e32 v61, v56
; SI-NEXT: v_mov_b32_e32 v56, v43
; SI-NEXT: v_mov_b32_e32 v43, v54
; SI-NEXT: v_mov_b32_e32 v54, v24
-; SI-NEXT: v_mov_b32_e32 v50, v34
-; SI-NEXT: v_mov_b32_e32 v34, v62
+; SI-NEXT: v_mov_b32_e32 v50, v62
; SI-NEXT: v_mov_b32_e32 v62, v57
; SI-NEXT: v_mov_b32_e32 v57, v44
; SI-NEXT: v_mov_b32_e32 v44, v55
; SI-NEXT: v_mov_b32_e32 v55, v25
-; SI-NEXT: v_mov_b32_e32 v32, v33
+; SI-NEXT: v_mov_b32_e32 v32, v51
+; SI-NEXT: v_mov_b32_e32 v51, v33
; SI-NEXT: v_mov_b32_e32 v33, v63
; SI-NEXT: v_mov_b32_e32 v63, v58
; SI-NEXT: v_mov_b32_e32 v58, v45
; SI-NEXT: v_mov_b32_e32 v45, v40
; SI-NEXT: v_mov_b32_e32 v40, v31
-; SI-NEXT: v_mov_b32_e32 v39, v26
-; SI-NEXT: v_mov_b32_e32 v38, v27
-; SI-NEXT: v_mov_b32_e32 v37, v28
-; SI-NEXT: v_mov_b32_e32 v49, v36
-; SI-NEXT: v_mov_b32_e32 v36, v29
+; SI-NEXT: v_mov_b32_e32 v38, v26
+; SI-NEXT: v_mov_b32_e32 v37, v27
+; SI-NEXT: v_mov_b32_e32 v36, v28
+; SI-NEXT: v_mov_b32_e32 v35, v49
+; SI-NEXT: v_mov_b32_e32 v49, v29
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT: v_mov_b32_e32 v29, v36
-; SI-NEXT: v_mov_b32_e32 v36, v49
-; SI-NEXT: v_mov_b32_e32 v28, v37
-; SI-NEXT: v_mov_b32_e32 v27, v38
-; SI-NEXT: v_mov_b32_e32 v26, v39
+; SI-NEXT: v_mov_b32_e32 v29, v49
+; SI-NEXT: v_mov_b32_e32 v49, v35
+; SI-NEXT: v_mov_b32_e32 v28, v36
+; SI-NEXT: v_mov_b32_e32 v27, v37
+; SI-NEXT: v_mov_b32_e32 v26, v38
; SI-NEXT: v_mov_b32_e32 v31, v40
; SI-NEXT: v_mov_b32_e32 v40, v45
; SI-NEXT: v_mov_b32_e32 v45, v58
; SI-NEXT: v_mov_b32_e32 v58, v63
; SI-NEXT: v_mov_b32_e32 v63, v33
-; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v33, v51
+; SI-NEXT: v_mov_b32_e32 v51, v32
; SI-NEXT: v_mov_b32_e32 v25, v55
; SI-NEXT: v_mov_b32_e32 v55, v44
; SI-NEXT: v_mov_b32_e32 v44, v57
; SI-NEXT: v_mov_b32_e32 v57, v62
-; SI-NEXT: v_mov_b32_e32 v62, v34
-; SI-NEXT: v_mov_b32_e32 v34, v50
+; SI-NEXT: v_mov_b32_e32 v62, v50
; SI-NEXT: v_mov_b32_e32 v24, v54
; SI-NEXT: v_mov_b32_e32 v54, v43
; SI-NEXT: v_mov_b32_e32 v43, v56
; SI-NEXT: v_mov_b32_e32 v56, v61
-; SI-NEXT: v_mov_b32_e32 v61, v35
+; SI-NEXT: v_mov_b32_e32 v61, v34
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v22, v53
; SI-NEXT: v_mov_b32_e32 v53, v42
; SI-NEXT: v_mov_b32_e32 v42, v47
; SI-NEXT: v_mov_b32_e32 v47, v60
-; SI-NEXT: v_mov_b32_e32 v60, v48
+; SI-NEXT: v_mov_b32_e32 v60, v39
+; SI-NEXT: v_mov_b32_e32 v39, v48
; SI-NEXT: v_mov_b32_e32 v23, v52
; SI-NEXT: v_mov_b32_e32 v52, v41
; SI-NEXT: v_mov_b32_e32 v41, v46
; SI-NEXT: v_mov_b32_e32 v46, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_branch .LBB35_2
;
@@ -20712,7 +20724,7 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0
; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -20734,11 +20746,11 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3
; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -20781,19 +20793,22 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
; GFX11-TRUE16-NEXT: .LBB41_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s62
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s58
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s60
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s61
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s59
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47
@@ -20823,10 +20838,10 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB41_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16
@@ -24465,7 +24480,7 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0
; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -24487,11 +24502,11 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3
; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -24534,19 +24549,22 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16
; GFX11-TRUE16-NEXT: .LBB45_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s62
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s58
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s60
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14
; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28
; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s61
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s58
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s59
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47
@@ -24576,10 +24594,10 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB45_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16
@@ -25857,13 +25875,13 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v1
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v3
; SI-NEXT: v_cvt_f16_f32_e32 v38, v2
; SI-NEXT: v_cvt_f16_f32_e32 v37, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v4
; SI-NEXT: v_cvt_f16_f32_e32 v35, v7
; SI-NEXT: v_cvt_f16_f32_e32 v50, v6
; SI-NEXT: v_cvt_f16_f32_e32 v32, v9
@@ -25891,37 +25909,36 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v51, s17
; SI-NEXT: v_cvt_f16_f32_e32 v33, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v34, s18
-; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v3, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, s21
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v62, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v63, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v62, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v34, s22
; SI-NEXT: v_cvt_f16_f32_e32 v61, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v36, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v49, s24
; SI-NEXT: v_cvt_f16_f32_e32 v29, s27
; SI-NEXT: v_cvt_f16_f32_e32 v28, s26
; SI-NEXT: v_cvt_f16_f32_e32 v27, s29
; SI-NEXT: v_cvt_f16_f32_e32 v26, s28
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
@@ -25936,14 +25953,14 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22
; SI-NEXT: v_or_b32_e32 v0, v33, v0
-; SI-NEXT: v_or_b32_e32 v1, v34, v1
-; SI-NEXT: v_or_b32_e32 v3, v62, v3
-; SI-NEXT: v_or_b32_e32 v4, v36, v4
+; SI-NEXT: v_or_b32_e32 v2, v63, v2
+; SI-NEXT: v_or_b32_e32 v3, v34, v3
+; SI-NEXT: v_or_b32_e32 v4, v49, v4
; SI-NEXT: v_or_b32_e32 v5, v28, v5
; SI-NEXT: v_or_b32_e32 v6, v26, v6
-; SI-NEXT: v_or_b32_e32 v7, v48, v7
+; SI-NEXT: v_or_b32_e32 v7, v60, v7
; SI-NEXT: v_or_b32_e32 v8, v38, v8
-; SI-NEXT: v_or_b32_e32 v9, v49, v9
+; SI-NEXT: v_or_b32_e32 v9, v36, v9
; SI-NEXT: v_or_b32_e32 v10, v50, v10
; SI-NEXT: v_or_b32_e32 v11, v59, v11
; SI-NEXT: v_or_b32_e32 v12, v57, v12
@@ -25959,65 +25976,64 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v0, v51
; SI-NEXT: v_cvt_f32_f16_e32 v1, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v62
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v49
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v26
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v60
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
; SI-NEXT: v_cvt_f32_f16_e32 v10, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v36
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
; SI-NEXT: v_cvt_f32_f16_e32 v14, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v44
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v47
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v44
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
; SI-NEXT: v_cvt_f32_f16_e32 v18, v41
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v53
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v53
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v21, v31
@@ -26031,25 +26047,27 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v62
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -26069,12 +26087,12 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v39
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v48
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
@@ -26162,86 +26180,89 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v59, v46
; SI-NEXT: v_mov_b32_e32 v46, v41
; SI-NEXT: v_mov_b32_e32 v41, v52
; SI-NEXT: v_mov_b32_e32 v52, v23
-; SI-NEXT: v_mov_b32_e32 v48, v60
+; SI-NEXT: v_mov_b32_e32 v48, v39
+; SI-NEXT: v_mov_b32_e32 v39, v60
; SI-NEXT: v_mov_b32_e32 v60, v47
; SI-NEXT: v_mov_b32_e32 v47, v42
; SI-NEXT: v_mov_b32_e32 v42, v53
; SI-NEXT: v_mov_b32_e32 v53, v22
-; SI-NEXT: v_mov_b32_e32 v35, v61
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v34, v61
; SI-NEXT: v_mov_b32_e32 v61, v56
; SI-NEXT: v_mov_b32_e32 v56, v43
; SI-NEXT: v_mov_b32_e32 v43, v54
; SI-NEXT: v_mov_b32_e32 v54, v24
-; SI-NEXT: v_mov_b32_e32 v50, v34
-; SI-NEXT: v_mov_b32_e32 v34, v62
+; SI-NEXT: v_mov_b32_e32 v50, v62
; SI-NEXT: v_mov_b32_e32 v62, v57
; SI-NEXT: v_mov_b32_e32 v57, v44
; SI-NEXT: v_mov_b32_e32 v44, v55
; SI-NEXT: v_mov_b32_e32 v55, v25
-; SI-NEXT: v_mov_b32_e32 v32, v33
+; SI-NEXT: v_mov_b32_e32 v32, v51
+; SI-NEXT: v_mov_b32_e32 v51, v33
; SI-NEXT: v_mov_b32_e32 v33, v63
; SI-NEXT: v_mov_b32_e32 v63, v58
; SI-NEXT: v_mov_b32_e32 v58, v45
; SI-NEXT: v_mov_b32_e32 v45, v40
; SI-NEXT: v_mov_b32_e32 v40, v31
-; SI-NEXT: v_mov_b32_e32 v39, v26
-; SI-NEXT: v_mov_b32_e32 v38, v27
-; SI-NEXT: v_mov_b32_e32 v37, v28
-; SI-NEXT: v_mov_b32_e32 v49, v36
-; SI-NEXT: v_mov_b32_e32 v36, v29
+; SI-NEXT: v_mov_b32_e32 v38, v26
+; SI-NEXT: v_mov_b32_e32 v37, v27
+; SI-NEXT: v_mov_b32_e32 v36, v28
+; SI-NEXT: v_mov_b32_e32 v35, v49
+; SI-NEXT: v_mov_b32_e32 v49, v29
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT: v_mov_b32_e32 v29, v36
-; SI-NEXT: v_mov_b32_e32 v36, v49
-; SI-NEXT: v_mov_b32_e32 v28, v37
-; SI-NEXT: v_mov_b32_e32 v27, v38
-; SI-NEXT: v_mov_b32_e32 v26, v39
+; SI-NEXT: v_mov_b32_e32 v29, v49
+; SI-NEXT: v_mov_b32_e32 v49, v35
+; SI-NEXT: v_mov_b32_e32 v28, v36
+; SI-NEXT: v_mov_b32_e32 v27, v37
+; SI-NEXT: v_mov_b32_e32 v26, v38
; SI-NEXT: v_mov_b32_e32 v31, v40
; SI-NEXT: v_mov_b32_e32 v40, v45
; SI-NEXT: v_mov_b32_e32 v45, v58
; SI-NEXT: v_mov_b32_e32 v58, v63
; SI-NEXT: v_mov_b32_e32 v63, v33
-; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v33, v51
+; SI-NEXT: v_mov_b32_e32 v51, v32
; SI-NEXT: v_mov_b32_e32 v25, v55
; SI-NEXT: v_mov_b32_e32 v55, v44
; SI-NEXT: v_mov_b32_e32 v44, v57
; SI-NEXT: v_mov_b32_e32 v57, v62
-; SI-NEXT: v_mov_b32_e32 v62, v34
-; SI-NEXT: v_mov_b32_e32 v34, v50
+; SI-NEXT: v_mov_b32_e32 v62, v50
; SI-NEXT: v_mov_b32_e32 v24, v54
; SI-NEXT: v_mov_b32_e32 v54, v43
; SI-NEXT: v_mov_b32_e32 v43, v56
; SI-NEXT: v_mov_b32_e32 v56, v61
-; SI-NEXT: v_mov_b32_e32 v61, v35
+; SI-NEXT: v_mov_b32_e32 v61, v34
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v22, v53
; SI-NEXT: v_mov_b32_e32 v53, v42
; SI-NEXT: v_mov_b32_e32 v42, v47
; SI-NEXT: v_mov_b32_e32 v47, v60
-; SI-NEXT: v_mov_b32_e32 v60, v48
+; SI-NEXT: v_mov_b32_e32 v60, v39
+; SI-NEXT: v_mov_b32_e32 v39, v48
; SI-NEXT: v_mov_b32_e32 v23, v52
; SI-NEXT: v_mov_b32_e32 v52, v41
; SI-NEXT: v_mov_b32_e32 v41, v46
; SI-NEXT: v_mov_b32_e32 v46, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_branch .LBB47_2
;
@@ -33110,13 +33131,13 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v1
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v3
; SI-NEXT: v_cvt_f16_f32_e32 v38, v2
; SI-NEXT: v_cvt_f16_f32_e32 v37, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v4
; SI-NEXT: v_cvt_f16_f32_e32 v35, v7
; SI-NEXT: v_cvt_f16_f32_e32 v50, v6
; SI-NEXT: v_cvt_f16_f32_e32 v32, v9
@@ -33144,37 +33165,36 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v51, s17
; SI-NEXT: v_cvt_f16_f32_e32 v33, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v34, s18
-; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v3, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, s21
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v62, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v63, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v62, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v34, s22
; SI-NEXT: v_cvt_f16_f32_e32 v61, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v36, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v49, s24
; SI-NEXT: v_cvt_f16_f32_e32 v29, s27
; SI-NEXT: v_cvt_f16_f32_e32 v28, s26
; SI-NEXT: v_cvt_f16_f32_e32 v27, s29
; SI-NEXT: v_cvt_f16_f32_e32 v26, s28
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
@@ -33189,14 +33209,14 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22
; SI-NEXT: v_or_b32_e32 v0, v33, v0
-; SI-NEXT: v_or_b32_e32 v1, v34, v1
-; SI-NEXT: v_or_b32_e32 v3, v62, v3
-; SI-NEXT: v_or_b32_e32 v4, v36, v4
+; SI-NEXT: v_or_b32_e32 v2, v63, v2
+; SI-NEXT: v_or_b32_e32 v3, v34, v3
+; SI-NEXT: v_or_b32_e32 v4, v49, v4
; SI-NEXT: v_or_b32_e32 v5, v28, v5
; SI-NEXT: v_or_b32_e32 v6, v26, v6
-; SI-NEXT: v_or_b32_e32 v7, v48, v7
+; SI-NEXT: v_or_b32_e32 v7, v60, v7
; SI-NEXT: v_or_b32_e32 v8, v38, v8
-; SI-NEXT: v_or_b32_e32 v9, v49, v9
+; SI-NEXT: v_or_b32_e32 v9, v36, v9
; SI-NEXT: v_or_b32_e32 v10, v50, v10
; SI-NEXT: v_or_b32_e32 v11, v59, v11
; SI-NEXT: v_or_b32_e32 v12, v57, v12
@@ -33212,65 +33232,64 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: s_cbranch_execnz .LBB55_3
; SI-NEXT: .LBB55_2: ; %cmp.true
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v0, v51
; SI-NEXT: v_cvt_f32_f16_e32 v1, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v62
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v49
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v26
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v60
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
; SI-NEXT: v_cvt_f32_f16_e32 v10, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v36
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v50
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v58
; SI-NEXT: v_cvt_f32_f16_e32 v14, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v44
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v47
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v44
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
; SI-NEXT: v_cvt_f32_f16_e32 v18, v41
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v53
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v53
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v21, v31
@@ -33284,25 +33303,27 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v62
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -33322,12 +33343,12 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v39
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v48
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
@@ -33415,86 +33436,89 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB55_4:
; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v59, v46
; SI-NEXT: v_mov_b32_e32 v46, v41
; SI-NEXT: v_mov_b32_e32 v41, v52
; SI-NEXT: v_mov_b32_e32 v52, v23
-; SI-NEXT: v_mov_b32_e32 v48, v60
+; SI-NEXT: v_mov_b32_e32 v48, v39
+; SI-NEXT: v_mov_b32_e32 v39, v60
; SI-NEXT: v_mov_b32_e32 v60, v47
; SI-NEXT: v_mov_b32_e32 v47, v42
; SI-NEXT: v_mov_b32_e32 v42, v53
; SI-NEXT: v_mov_b32_e32 v53, v22
-; SI-NEXT: v_mov_b32_e32 v35, v61
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v34, v61
; SI-NEXT: v_mov_b32_e32 v61, v56
; SI-NEXT: v_mov_b32_e32 v56, v43
; SI-NEXT: v_mov_b32_e32 v43, v54
; SI-NEXT: v_mov_b32_e32 v54, v24
-; SI-NEXT: v_mov_b32_e32 v50, v34
-; SI-NEXT: v_mov_b32_e32 v34, v62
+; SI-NEXT: v_mov_b32_e32 v50, v62
; SI-NEXT: v_mov_b32_e32 v62, v57
; SI-NEXT: v_mov_b32_e32 v57, v44
; SI-NEXT: v_mov_b32_e32 v44, v55
; SI-NEXT: v_mov_b32_e32 v55, v25
-; SI-NEXT: v_mov_b32_e32 v32, v33
+; SI-NEXT: v_mov_b32_e32 v32, v51
+; SI-NEXT: v_mov_b32_e32 v51, v33
; SI-NEXT: v_mov_b32_e32 v33, v63
; SI-NEXT: v_mov_b32_e32 v63, v58
; SI-NEXT: v_mov_b32_e32 v58, v45
; SI-NEXT: v_mov_b32_e32 v45, v40
; SI-NEXT: v_mov_b32_e32 v40, v31
-; SI-NEXT: v_mov_b32_e32 v39, v26
-; SI-NEXT: v_mov_b32_e32 v38, v27
-; SI-NEXT: v_mov_b32_e32 v37, v28
-; SI-NEXT: v_mov_b32_e32 v49, v36
-; SI-NEXT: v_mov_b32_e32 v36, v29
+; SI-NEXT: v_mov_b32_e32 v38, v26
+; SI-NEXT: v_mov_b32_e32 v37, v27
+; SI-NEXT: v_mov_b32_e32 v36, v28
+; SI-NEXT: v_mov_b32_e32 v35, v49
+; SI-NEXT: v_mov_b32_e32 v49, v29
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT: v_mov_b32_e32 v29, v36
-; SI-NEXT: v_mov_b32_e32 v36, v49
-; SI-NEXT: v_mov_b32_e32 v28, v37
-; SI-NEXT: v_mov_b32_e32 v27, v38
-; SI-NEXT: v_mov_b32_e32 v26, v39
+; SI-NEXT: v_mov_b32_e32 v29, v49
+; SI-NEXT: v_mov_b32_e32 v49, v35
+; SI-NEXT: v_mov_b32_e32 v28, v36
+; SI-NEXT: v_mov_b32_e32 v27, v37
+; SI-NEXT: v_mov_b32_e32 v26, v38
; SI-NEXT: v_mov_b32_e32 v31, v40
; SI-NEXT: v_mov_b32_e32 v40, v45
; SI-NEXT: v_mov_b32_e32 v45, v58
; SI-NEXT: v_mov_b32_e32 v58, v63
; SI-NEXT: v_mov_b32_e32 v63, v33
-; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v33, v51
+; SI-NEXT: v_mov_b32_e32 v51, v32
; SI-NEXT: v_mov_b32_e32 v25, v55
; SI-NEXT: v_mov_b32_e32 v55, v44
; SI-NEXT: v_mov_b32_e32 v44, v57
; SI-NEXT: v_mov_b32_e32 v57, v62
-; SI-NEXT: v_mov_b32_e32 v62, v34
-; SI-NEXT: v_mov_b32_e32 v34, v50
+; SI-NEXT: v_mov_b32_e32 v62, v50
; SI-NEXT: v_mov_b32_e32 v24, v54
; SI-NEXT: v_mov_b32_e32 v54, v43
; SI-NEXT: v_mov_b32_e32 v43, v56
; SI-NEXT: v_mov_b32_e32 v56, v61
-; SI-NEXT: v_mov_b32_e32 v61, v35
+; SI-NEXT: v_mov_b32_e32 v61, v34
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v22, v53
; SI-NEXT: v_mov_b32_e32 v53, v42
; SI-NEXT: v_mov_b32_e32 v42, v47
; SI-NEXT: v_mov_b32_e32 v47, v60
-; SI-NEXT: v_mov_b32_e32 v60, v48
+; SI-NEXT: v_mov_b32_e32 v60, v39
+; SI-NEXT: v_mov_b32_e32 v39, v48
; SI-NEXT: v_mov_b32_e32 v23, v52
; SI-NEXT: v_mov_b32_e32 v52, v41
; SI-NEXT: v_mov_b32_e32 v41, v46
; SI-NEXT: v_mov_b32_e32 v46, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_branch .LBB55_2
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index a2bd1d30cc634..d84d3230f9538 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -3868,7 +3868,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5
-; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0
; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -3892,11 +3892,11 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -3943,16 +3943,18 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s0, 16
; GFX11-TRUE16-NEXT: .LBB13_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, s74
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s73
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, s72
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s63
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s62
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59
@@ -3987,10 +3989,10 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
@@ -6144,9 +6146,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
@@ -6162,7 +6164,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
@@ -6178,9 +6180,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -6197,9 +6199,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
@@ -6223,9 +6225,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -8055,7 +8057,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5
-; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0
; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -8079,11 +8081,11 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3
; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -8130,16 +8132,18 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s0, 16
; GFX11-TRUE16-NEXT: .LBB17_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, s74
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s73
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, s72
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s63
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s62
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59
@@ -8174,10 +8178,10 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB17_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
@@ -10603,9 +10607,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
@@ -10621,7 +10625,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
@@ -10637,9 +10641,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -10656,9 +10660,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
@@ -10682,9 +10686,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -15932,9 +15936,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
@@ -15950,7 +15954,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
@@ -15966,9 +15970,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -15985,9 +15989,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
@@ -16011,9 +16015,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -20387,9 +20391,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
@@ -20405,7 +20409,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
@@ -20421,9 +20425,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -20440,9 +20444,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
@@ -20466,9 +20470,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -22703,7 +22707,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5
-; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0
; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -22727,11 +22731,11 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3
; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -22778,16 +22782,18 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s0, 16
; GFX11-TRUE16-NEXT: .LBB41_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, s74
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s73
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, s72
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s63
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s62
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59
@@ -22822,10 +22828,10 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB41_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
@@ -24979,9 +24985,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
@@ -24997,7 +25003,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
@@ -25013,9 +25019,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -25032,9 +25038,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
@@ -25058,9 +25064,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -26902,7 +26908,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5
-; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0
; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -26926,11 +26932,11 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3
; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -26977,16 +26983,18 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s0, 16
; GFX11-TRUE16-NEXT: .LBB45_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, s74
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s73
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s62
+; GFX11-TRUE16-NEXT: s_mov_b32 s62, s72
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s63
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s62
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59
@@ -27021,10 +27029,10 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB45_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16
@@ -29450,9 +29458,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
@@ -29468,7 +29476,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
@@ -29484,9 +29492,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -29503,9 +29511,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
@@ -29529,9 +29537,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -33147,9 +33155,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
@@ -33165,7 +33173,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
@@ -33181,9 +33189,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -33200,9 +33208,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
@@ -33226,9 +33234,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -37520,9 +37528,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
@@ -37538,7 +37546,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
@@ -37554,9 +37562,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -37573,9 +37581,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
@@ -37599,9 +37607,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -39746,16 +39754,17 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v7, 16, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
@@ -39765,7 +39774,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
@@ -39779,7 +39788,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s17
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s7, 3 op_sel_hi:[1,0]
@@ -39807,18 +39816,18 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v50.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
@@ -39828,16 +39837,15 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v53.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v55.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
@@ -39855,15 +39863,15 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s45
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s43
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s42
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s41
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15
@@ -39880,31 +39888,30 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s9
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v33, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v27
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v9
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v49, 16, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v25
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v23
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v14
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v21, 16, v16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v26
@@ -39912,8 +39919,9 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v23
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v23, 16, v17
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v19, 16, v22
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v30, 16, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -39927,7 +39935,6 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v48, 16, v50
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v37, 16, v38
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v11
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1
@@ -39936,7 +39943,9 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v48f16_scalar:
@@ -41941,16 +41950,17 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v7, 16, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
@@ -41960,7 +41970,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
@@ -41974,7 +41984,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s17
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s7 op_sel_hi:[0,1]
@@ -42002,18 +42012,18 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v50.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
@@ -42023,16 +42033,15 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v53.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v55.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
@@ -42050,15 +42059,15 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s45
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s43
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s42
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s41
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15
@@ -42075,31 +42084,30 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s9
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v33, 16, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v27
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v9
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v49, 16, v51
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v25
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v23
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v22
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v14
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v21, 16, v16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v26
@@ -42107,8 +42115,9 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v23
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v23, 16, v17
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v19, 16, v22
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v30, 16, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -42122,7 +42131,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v48, 16, v50
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v7
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v37, 16, v38
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v11
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1
@@ -42131,7 +42139,9 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v48i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index fbee320c82c7f..f4b487be4316b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -4145,7 +4145,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7
-; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0
; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -4171,11 +4171,11 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -4226,16 +4226,18 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s0, 16
; GFX11-TRUE16-NEXT: .LBB13_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, s78
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s77
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s74
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, s76
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s75
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s74
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63
@@ -4273,10 +4275,10 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
@@ -6638,8 +6640,8 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
@@ -6656,7 +6658,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
@@ -6672,9 +6674,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -6693,9 +6695,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
@@ -6721,9 +6723,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -8756,7 +8758,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7
-; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0
; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -8782,11 +8784,11 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3
; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -8837,16 +8839,18 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s0, 16
; GFX11-TRUE16-NEXT: .LBB17_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, s78
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s77
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s74
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, s76
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s75
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s74
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63
@@ -8884,10 +8888,10 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB17_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
@@ -10464,15 +10468,15 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v4
; SI-NEXT: v_cvt_f16_f32_e32 v56, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v54, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v52, v12
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
@@ -10490,13 +10494,13 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v27, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v29
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v53, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v2, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, s18
; SI-NEXT: v_cvt_f16_f32_e32 v12, s21
; SI-NEXT: v_cvt_f16_f32_e32 v14, s20
-; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s23
; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
@@ -10538,9 +10542,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_4
@@ -10551,17 +10555,18 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; SI-NEXT: v_or_b32_e32 v0, v11, v0
; SI-NEXT: v_or_b32_e32 v2, v14, v2
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
@@ -10569,11 +10574,12 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v6, v7, v6
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
@@ -10585,10 +10591,10 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v51, v46
; SI-NEXT: v_or_b32_e32 v7, v45, v7
; SI-NEXT: v_or_b32_e32 v8, v40, v8
-; SI-NEXT: v_or_b32_e32 v9, v55, v9
+; SI-NEXT: v_or_b32_e32 v9, v42, v9
; SI-NEXT: v_or_b32_e32 v10, v54, v10
-; SI-NEXT: v_or_b32_e32 v11, v47, v11
-; SI-NEXT: v_or_b32_e32 v12, v60, v12
+; SI-NEXT: v_or_b32_e32 v11, v57, v11
+; SI-NEXT: v_or_b32_e32 v12, v53, v12
; SI-NEXT: v_or_b32_e32 v13, v52, v13
; SI-NEXT: v_or_b32_e32 v14, v63, v14
; SI-NEXT: v_or_b32_e32 v15, v61, v15
@@ -10616,14 +10622,16 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB19_3
; SI-NEXT: .LBB19_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v42
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v11, v54
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -10635,8 +10643,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v53
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
; SI-NEXT: v_cvt_f32_f16_e32 v16, v63
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
@@ -10736,7 +10744,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
@@ -10774,7 +10782,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v55
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -10784,12 +10792,12 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v47
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v60
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -11532,8 +11540,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
@@ -11550,7 +11558,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
@@ -11566,9 +11574,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -11587,9 +11595,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
@@ -11615,9 +11623,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -17336,8 +17344,8 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
@@ -17354,7 +17362,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
@@ -17370,9 +17378,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -17391,9 +17399,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
@@ -17419,9 +17427,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -18588,9 +18596,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: s_lshr_b32 s4, s10, 16
; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: s_lshr_b32 s4, s11, 16
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v60, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v9, s4
; SI-NEXT: s_lshr_b32 s4, s12, 16
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_cvt_f32_f16_e32 v59, s4
; SI-NEXT: s_lshr_b32 s4, s13, 16
; SI-NEXT: v_cvt_f32_f16_e32 v14, s4
@@ -18634,8 +18642,8 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v32, s6
; SI-NEXT: v_cvt_f32_f16_e32 v34, s7
; SI-NEXT: v_cvt_f32_f16_e32 v36, s8
-; SI-NEXT: v_cvt_f32_f16_e32 v38, s10
-; SI-NEXT: v_cvt_f32_f16_e32 v48, s11
+; SI-NEXT: v_cvt_f32_f16_e32 v46, s10
+; SI-NEXT: v_cvt_f32_f16_e32 v38, s11
; SI-NEXT: v_cvt_f32_f16_e32 v15, s12
; SI-NEXT: v_cvt_f32_f16_e32 v17, s13
; SI-NEXT: v_cvt_f32_f16_e32 v19, s14
@@ -18650,7 +18658,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v12, s24
; SI-NEXT: v_cvt_f32_f16_e32 v13, s23
; SI-NEXT: v_cvt_f32_f16_e32 v11, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v9, s21
+; SI-NEXT: v_cvt_f32_f16_e32 v50, s21
; SI-NEXT: v_cvt_f32_f16_e32 v52, s20
; SI-NEXT: v_cvt_f32_f16_e32 v53, s19
; SI-NEXT: v_cvt_f32_f16_e32 v55, s18
@@ -18659,7 +18667,16 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: s_cbranch_execnz .LBB33_3
; SI-NEXT: .LBB33_2: ; %cmp.true
; SI-NEXT: v_add_f32_e64 v1, s16, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_add_f32_e64 v6, s6, 1.0
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f32_e64 v2, s17, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e64 v3, s18, 1.0
; SI-NEXT: v_add_f32_e64 v5, s19, 1.0
; SI-NEXT: v_add_f32_e64 v7, s20, 1.0
@@ -18682,47 +18699,36 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_add_f32_e64 v12, s10, 1.0
; SI-NEXT: v_add_f32_e64 v10, s8, 1.0
; SI-NEXT: v_add_f32_e64 v8, s7, 1.0
-; SI-NEXT: v_add_f32_e64 v6, s6, 1.0
; SI-NEXT: v_add_f32_e64 v29, s9, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23
; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19
-; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17
-; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v15
-; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v13
-; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v12
-; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29
; SI-NEXT: v_cvt_f32_f16_e32 v45, v29
; SI-NEXT: v_cvt_f32_f16_e32 v32, v6
; SI-NEXT: v_cvt_f32_f16_e32 v34, v8
; SI-NEXT: v_cvt_f32_f16_e32 v36, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v13
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
@@ -18737,38 +18743,37 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v12, v16
; SI-NEXT: v_cvt_f32_f16_e32 v13, v14
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v9
; SI-NEXT: v_cvt_f32_f16_e32 v52, v7
; SI-NEXT: v_cvt_f32_f16_e32 v53, v5
; SI-NEXT: v_cvt_f32_f16_e32 v55, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v2
; SI-NEXT: v_cvt_f32_f16_e32 v43, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v4
; SI-NEXT: v_cvt_f32_f16_e32 v3, v63
; SI-NEXT: v_cvt_f32_f16_e32 v5, v62
; SI-NEXT: v_cvt_f32_f16_e32 v7, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v60, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v60
; SI-NEXT: v_cvt_f32_f16_e32 v59, v59
; SI-NEXT: v_cvt_f32_f16_e32 v14, v58
; SI-NEXT: v_cvt_f32_f16_e32 v16, v57
; SI-NEXT: v_cvt_f32_f16_e32 v18, v56
; SI-NEXT: v_cvt_f32_f16_e32 v20, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v50
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v44
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v4
; SI-NEXT: .LBB33_3: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v4, v44
; SI-NEXT: v_cvt_f16_f32_e32 v30, v43
@@ -18805,7 +18810,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v50
; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v30, v4
@@ -18909,15 +18914,15 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v60
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v38
; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v46
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v6, v4
@@ -18979,7 +18984,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; implicit-def: $vgpr39
@@ -19009,9 +19014,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr5
@@ -21163,15 +21168,15 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v4
; SI-NEXT: v_cvt_f16_f32_e32 v56, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v54, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v52, v12
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
@@ -21189,13 +21194,13 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v27, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v29
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v53, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v2, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, s18
; SI-NEXT: v_cvt_f16_f32_e32 v12, s21
; SI-NEXT: v_cvt_f16_f32_e32 v14, s20
-; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s23
; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
@@ -21237,9 +21242,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_4
@@ -21250,17 +21255,18 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; SI-NEXT: v_or_b32_e32 v0, v11, v0
; SI-NEXT: v_or_b32_e32 v2, v14, v2
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
@@ -21268,11 +21274,12 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v6, v7, v6
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
@@ -21284,10 +21291,10 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_mov_b32_e32 v51, v46
; SI-NEXT: v_or_b32_e32 v7, v45, v7
; SI-NEXT: v_or_b32_e32 v8, v40, v8
-; SI-NEXT: v_or_b32_e32 v9, v55, v9
+; SI-NEXT: v_or_b32_e32 v9, v42, v9
; SI-NEXT: v_or_b32_e32 v10, v54, v10
-; SI-NEXT: v_or_b32_e32 v11, v47, v11
-; SI-NEXT: v_or_b32_e32 v12, v60, v12
+; SI-NEXT: v_or_b32_e32 v11, v57, v11
+; SI-NEXT: v_or_b32_e32 v12, v53, v12
; SI-NEXT: v_or_b32_e32 v13, v52, v13
; SI-NEXT: v_or_b32_e32 v14, v63, v14
; SI-NEXT: v_or_b32_e32 v15, v61, v15
@@ -21315,14 +21322,16 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB35_3
; SI-NEXT: .LBB35_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v42
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v11, v54
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -21334,8 +21343,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v53
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
; SI-NEXT: v_cvt_f32_f16_e32 v16, v63
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
@@ -21435,7 +21444,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
@@ -21473,7 +21482,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v55
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -21483,12 +21492,12 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v47
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v60
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -22231,8 +22240,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
@@ -22249,7 +22258,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
@@ -22265,9 +22274,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -22286,9 +22295,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
@@ -22314,9 +22323,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -24742,7 +24751,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7
-; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0
; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -24768,11 +24777,11 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3
; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -24823,16 +24832,18 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s0, 16
; GFX11-TRUE16-NEXT: .LBB41_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, s78
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s77
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s74
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, s76
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s75
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s74
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63
@@ -24870,10 +24881,10 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB41_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
@@ -27235,8 +27246,8 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
@@ -27253,7 +27264,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
@@ -27269,9 +27280,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -27290,9 +27301,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
@@ -27318,9 +27329,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -29368,7 +29379,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7
-; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0
; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -29394,11 +29405,11 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3
; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -29449,16 +29460,18 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s0, 16
; GFX11-TRUE16-NEXT: .LBB45_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, s78
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s77
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s74
+; GFX11-TRUE16-NEXT: s_mov_b32 s74, s76
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s75
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s74
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63
@@ -29496,10 +29509,10 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB45_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16
@@ -31076,15 +31089,15 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v4
; SI-NEXT: v_cvt_f16_f32_e32 v56, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v54, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v52, v12
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
@@ -31102,13 +31115,13 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v27, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v29
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v53, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v2, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, s18
; SI-NEXT: v_cvt_f16_f32_e32 v12, s21
; SI-NEXT: v_cvt_f16_f32_e32 v14, s20
-; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s23
; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
@@ -31150,9 +31163,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_4
@@ -31163,17 +31176,18 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; SI-NEXT: v_or_b32_e32 v0, v11, v0
; SI-NEXT: v_or_b32_e32 v2, v14, v2
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
@@ -31181,11 +31195,12 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v6, v7, v6
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
@@ -31197,10 +31212,10 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v51, v46
; SI-NEXT: v_or_b32_e32 v7, v45, v7
; SI-NEXT: v_or_b32_e32 v8, v40, v8
-; SI-NEXT: v_or_b32_e32 v9, v55, v9
+; SI-NEXT: v_or_b32_e32 v9, v42, v9
; SI-NEXT: v_or_b32_e32 v10, v54, v10
-; SI-NEXT: v_or_b32_e32 v11, v47, v11
-; SI-NEXT: v_or_b32_e32 v12, v60, v12
+; SI-NEXT: v_or_b32_e32 v11, v57, v11
+; SI-NEXT: v_or_b32_e32 v12, v53, v12
; SI-NEXT: v_or_b32_e32 v13, v52, v13
; SI-NEXT: v_or_b32_e32 v14, v63, v14
; SI-NEXT: v_or_b32_e32 v15, v61, v15
@@ -31228,14 +31243,16 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v42
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v11, v54
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -31247,8 +31264,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v53
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
; SI-NEXT: v_cvt_f32_f16_e32 v16, v63
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
@@ -31348,7 +31365,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
@@ -31386,7 +31403,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v55
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -31396,12 +31413,12 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v47
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v60
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -32144,8 +32161,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
@@ -32162,7 +32179,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
@@ -32178,9 +32195,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -32199,9 +32216,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
@@ -32227,9 +32244,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -36228,8 +36245,8 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
@@ -36246,7 +36263,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
@@ -36262,9 +36279,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -36283,9 +36300,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
@@ -36311,9 +36328,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -37469,12 +37486,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v42, s40
; SI-NEXT: s_lshr_b32 s40, s16, 16
; SI-NEXT: v_cvt_f32_f16_e32 v44, s40
-; SI-NEXT: v_cvt_f32_f16_e32 v46, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
; SI-NEXT: v_cvt_f32_f16_e32 v8, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s6
; SI-NEXT: v_cvt_f32_f16_e32 v12, s9
-; SI-NEXT: v_cvt_f32_f16_e32 v45, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v13, s8
; SI-NEXT: v_cvt_f32_f16_e32 v15, s11
; SI-NEXT: v_cvt_f32_f16_e32 v17, s10
; SI-NEXT: v_cvt_f32_f16_e32 v19, s13
@@ -37500,22 +37517,24 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0
; SI-NEXT: v_add_f64 v[54:55], s[18:19], 1.0
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v2
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55
; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v54
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v49
; SI-NEXT: v_cvt_f32_f16_e32 v53, v55
; SI-NEXT: v_cvt_f32_f16_e32 v55, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v40
; SI-NEXT: v_cvt_f32_f16_e32 v54, v42
-; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v44
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: v_add_f64 v[37:38], s[22:23], 1.0
; SI-NEXT: v_add_f64 v[33:34], s[24:25], 1.0
; SI-NEXT: v_add_f64 v[31:32], s[26:27], 1.0
@@ -37525,8 +37544,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0
; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0
; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v38
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33
@@ -37545,13 +37564,14 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v4
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v7
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v11
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v17, v14
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
@@ -37567,9 +37587,10 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v48, v37
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v2
; SI-NEXT: v_cvt_f32_f16_e32 v43, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v45
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
@@ -37588,191 +37609,187 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v37, v47
; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v51
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v44
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v44
; SI-NEXT: .LBB53_3: ; %end
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v43, v43
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v42
-; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
+; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT: v_or_b32_e32 v43, v43, v44
+; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
+; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0
-; SI-NEXT: v_or_b32_e32 v13, v41, v13
-; SI-NEXT: buffer_store_dword v13, v10, s[0:3], 0 offen
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v40
+; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0
+; SI-NEXT: v_or_b32_e32 v41, v41, v42
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v55
-; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v55, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0
+; SI-NEXT: v_or_b32_e32 v55, v55, v40
+; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v53
-; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v53, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0
+; SI-NEXT: v_or_b32_e32 v53, v53, v54
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v51
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v52
-; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v51, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v53, vcc, 16, v0
+; SI-NEXT: v_or_b32_e32 v51, v52, v51
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
+; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v50
-; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v51, vcc, 20, v0
+; SI-NEXT: v_or_b32_e32 v49, v50, v49
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v48
-; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v39, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v49, vcc, 24, v0
+; SI-NEXT: v_or_b32_e32 v39, v48, v39
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v38
-; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v37, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v39, vcc, 28, v0
+; SI-NEXT: v_or_b32_e32 v37, v38, v37
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v36
-; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v37, vcc, 32, v0
+; SI-NEXT: v_or_b32_e32 v35, v36, v35
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v34
-; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v33, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v35, vcc, 36, v0
+; SI-NEXT: v_or_b32_e32 v33, v34, v33
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v32
-; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v33, vcc, 40, v0
+; SI-NEXT: v_or_b32_e32 v31, v32, v31
+; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v30
-; SI-NEXT: v_add_i32_e32 v28, vcc, 44, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v31, vcc, 44, v0
+; SI-NEXT: v_or_b32_e32 v28, v30, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v29
-; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v26, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v28, v29
+; SI-NEXT: v_add_i32_e32 v29, vcc, 48, v0
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v28, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v27
-; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v24, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v27
+; SI-NEXT: v_add_i32_e32 v27, vcc, 52, v0
+; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v26, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v25
-; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v25
+; SI-NEXT: v_add_i32_e32 v25, vcc, 56, v0
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v24, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v20
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v23
-; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v23
+; SI-NEXT: v_add_i32_e32 v23, vcc, 60, v0
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v22, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v21
-; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v21
+; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v20, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v19
-; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v19
+; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v18, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v14
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v17
-; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v13, v10
-; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v16, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v11, v15
-; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_or_b32_e32 v10, v11, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v15
+; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v14, v11
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v45
-; SI-NEXT: v_add_i32_e32 v11, vcc, 0x50, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v13
+; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v10, v9
+; SI-NEXT: v_or_b32_e32 v9, v11, v9
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v9, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_add_i32_e32 v10, vcc, 0x54, v0
; SI-NEXT: v_or_b32_e32 v7, v9, v7
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v10
+; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v7, v5
+; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v8
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v4
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -37836,17 +37853,17 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_branch .LBB53_2
;
@@ -39964,15 +39981,15 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v4
; SI-NEXT: v_cvt_f16_f32_e32 v56, v7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v54, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v57, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v52, v12
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
@@ -39990,13 +40007,13 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v27, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v29
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v53, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v2, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v3, s18
; SI-NEXT: v_cvt_f16_f32_e32 v12, s21
; SI-NEXT: v_cvt_f16_f32_e32 v14, s20
-; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s23
; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
@@ -40038,9 +40055,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_4
@@ -40051,17 +40068,18 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
+; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; SI-NEXT: v_or_b32_e32 v0, v11, v0
; SI-NEXT: v_or_b32_e32 v2, v14, v2
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
@@ -40069,11 +40087,12 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v6, v7, v6
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v55
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v47
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
@@ -40085,10 +40104,10 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_mov_b32_e32 v51, v46
; SI-NEXT: v_or_b32_e32 v7, v45, v7
; SI-NEXT: v_or_b32_e32 v8, v40, v8
-; SI-NEXT: v_or_b32_e32 v9, v55, v9
+; SI-NEXT: v_or_b32_e32 v9, v42, v9
; SI-NEXT: v_or_b32_e32 v10, v54, v10
-; SI-NEXT: v_or_b32_e32 v11, v47, v11
-; SI-NEXT: v_or_b32_e32 v12, v60, v12
+; SI-NEXT: v_or_b32_e32 v11, v57, v11
+; SI-NEXT: v_or_b32_e32 v12, v53, v12
; SI-NEXT: v_or_b32_e32 v13, v52, v13
; SI-NEXT: v_or_b32_e32 v14, v63, v14
; SI-NEXT: v_or_b32_e32 v15, v61, v15
@@ -40116,14 +40135,16 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_or_b32_e32 v25, v38, v25
; SI-NEXT: s_cbranch_execnz .LBB55_3
; SI-NEXT: .LBB55_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v42
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v11, v54
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -40135,8 +40156,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v53
; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
; SI-NEXT: v_cvt_f32_f16_e32 v16, v63
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
@@ -40236,7 +40257,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
@@ -40274,7 +40295,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v55
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
@@ -40284,12 +40305,12 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v47
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v60
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -41032,8 +41053,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
@@ -41050,7 +41071,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
@@ -41066,9 +41087,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
@@ -41087,9 +41108,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
@@ -41115,9 +41136,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -42485,12 +42506,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28
; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
@@ -42522,70 +42543,75 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_cbranch_scc0 .LBB57_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_cvt_f32_f16_e32 v31, s22
+; SI-NEXT: v_mov_b32_e32 v46, v44
+; SI-NEXT: v_cvt_f32_f16_e32 v62, v44
+; SI-NEXT: v_mov_b32_e32 v44, v58
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v31, s24
; SI-NEXT: v_cvt_f32_f16_e32 v58, v58
; SI-NEXT: v_cvt_f32_f16_e32 v33, s16
; SI-NEXT: v_cvt_f32_f16_e32 v49, s17
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, s24
+; SI-NEXT: v_cvt_f32_f16_e32 v31, s26
; SI-NEXT: v_cvt_f32_f16_e32 v34, s18
; SI-NEXT: v_cvt_f32_f16_e32 v50, s19
; SI-NEXT: v_cvt_f32_f16_e32 v60, s20
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, s26
+; SI-NEXT: v_cvt_f32_f16_e32 v31, s28
; SI-NEXT: v_cvt_f32_f16_e32 v35, s21
; SI-NEXT: v_cvt_f32_f16_e32 v63, s23
; SI-NEXT: v_cvt_f32_f16_e32 v61, s25
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, s28
+; SI-NEXT: v_cvt_f32_f16_e32 v31, s29
; SI-NEXT: v_cvt_f32_f16_e32 v59, s27
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: v_cvt_f32_f16_e32 v42, v19
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, s29
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
; SI-NEXT: v_cvt_f32_f16_e32 v41, v21
; SI-NEXT: v_cvt_f32_f16_e32 v55, v23
; SI-NEXT: v_cvt_f32_f16_e32 v53, v25
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v2
; SI-NEXT: v_cvt_f32_f16_e32 v40, v26
; SI-NEXT: v_cvt_f32_f16_e32 v51, v27
; SI-NEXT: v_cvt_f32_f16_e32 v54, v28
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v57
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v56
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v37
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v62, v45
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v44
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
+; SI-NEXT: v_mov_b32_e32 v56, v32
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v6
+; SI-NEXT: v_mov_b32_e32 v45, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v36
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v7
@@ -42638,6 +42664,24 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: .LBB57_2:
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: v_mov_b32_e32 v56, v32
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: v_mov_b32_e32 v46, v44
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: v_mov_b32_e32 v45, v36
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: v_mov_b32_e32 v44, v58
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
@@ -42704,24 +42748,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: .LBB57_3: ; %Flow
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v58, v62
; SI-NEXT: v_mov_b32_e32 v62, v32
; SI-NEXT: v_mov_b32_e32 v32, v37
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v37, v39
; SI-NEXT: v_mov_b32_e32 v39, v51
; SI-NEXT: v_mov_b32_e32 v51, v53
@@ -42732,6 +42764,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v43
+; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v44
+; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v45
+; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v46
+; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v47
+; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v56
+; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v57
; SI-NEXT: s_add_i32 s22, s22, 3
; SI-NEXT: v_cvt_f32_f16_e32 v31, s22
; SI-NEXT: s_add_i32 s24, s24, 3
@@ -42741,29 +42779,23 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, s24
; SI-NEXT: s_add_i32 s29, s29, 3
-; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44
+; SI-NEXT: s_add_i32 s27, s27, 3
+; SI-NEXT: s_add_i32 s25, s25, 3
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, s26
-; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45
-; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, s28
-; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47
-; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, s29
-; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57
-; SI-NEXT: s_add_i32 s27, s27, 3
-; SI-NEXT: s_add_i32 s25, s25, 3
; SI-NEXT: s_add_i32 s23, s23, 3
; SI-NEXT: s_add_i32 s21, s21, 3
; SI-NEXT: s_add_i32 s20, s20, 3
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v31, s28
; SI-NEXT: s_add_i32 s19, s19, 3
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_add_i32 s17, s17, 3
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v31, s29
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: v_cvt_f32_f16_e32 v33, s16
; SI-NEXT: v_cvt_f32_f16_e32 v49, s17
@@ -42775,7 +42807,6 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v61, s25
; SI-NEXT: v_cvt_f32_f16_e32 v59, s27
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v57
; SI-NEXT: v_cvt_f32_f16_e32 v48, v56
; SI-NEXT: v_cvt_f32_f16_e32 v62, v47
; SI-NEXT: v_cvt_f32_f16_e32 v38, v46
@@ -42784,8 +42815,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v42
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v1
+; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v57
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
@@ -43651,13 +43683,16 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v10, 16, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v8, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v8, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v9, 16, v2
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v11, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3
@@ -43665,9 +43700,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v11, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s15, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
@@ -43681,9 +43716,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s8, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17
; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s7, 3 op_sel_hi:[1,0]
@@ -43712,14 +43747,14 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s3, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v54.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v29
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28
@@ -43736,16 +43771,13 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v53.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v65.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v67.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
@@ -43768,8 +43800,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s45
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s44
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s43
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s42
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s41
@@ -43826,8 +43858,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v26
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v48, 16, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v21, 16, v18
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v23, 16, v2
@@ -45215,7 +45247,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v5, v56
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
-; SI-NEXT: v_cvt_f16_f32_e32 v44, s17
+; SI-NEXT: v_cvt_f16_f32_e32 v45, s17
; SI-NEXT: v_cvt_f16_f32_e32 v43, s21
; SI-NEXT: v_cvt_f16_f32_e32 v42, s25
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -45225,25 +45257,26 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB59_3
; SI-NEXT: .LBB59_2: ; %cmp.true
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v44
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v45
; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
-; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
; SI-NEXT: v_cvt_f16_f32_e32 v45, v43
-; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44
+; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v30
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44
+; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v45
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v30
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42
; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42
-; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v44
+; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v45
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v45, v41
; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v42
; SI-NEXT: v_cvt_f32_f16_e32 v42, v55
@@ -45415,13 +45448,12 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v22
; SI-NEXT: v_or_b32_e32 v25, v25, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v23
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v44
+; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v45
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v40
; SI-NEXT: v_or_b32_e32 v24, v24, v29
; SI-NEXT: v_or_b32_e32 v27, v27, v43
-; SI-NEXT: v_or_b32_e32 v26, v26, v45
+; SI-NEXT: v_or_b32_e32 v26, v26, v44
; SI-NEXT: v_or_b32_e32 v21, v21, v30
; SI-NEXT: v_or_b32_e32 v20, v20, v41
; SI-NEXT: v_or_b32_e32 v49, v49, v46
@@ -45433,8 +45465,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v11, v11, v56
; SI-NEXT: v_or_b32_e32 v6, v6, v28
; SI-NEXT: v_or_b32_e32 v4, v4, v57
-; SI-NEXT: v_alignbit_b32 v44, v24, v43, 16
-; SI-NEXT: v_alignbit_b32 v43, v25, v45, 16
+; SI-NEXT: v_alignbit_b32 v45, v24, v43, 16
+; SI-NEXT: v_alignbit_b32 v43, v25, v44, 16
; SI-NEXT: v_alignbit_b32 v42, v19, v30, 16
; SI-NEXT: v_alignbit_b32 v30, v50, v41, 16
; SI-NEXT: v_alignbit_b32 v41, v48, v46, 16
@@ -45448,7 +45480,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; SI-NEXT: v_alignbit_b32 v28, v5, v57, 16
; SI-NEXT: .LBB59_3: ; %end
; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44
+; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v45
; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; SI-NEXT: v_or_b32_e32 v27, v27, v44
@@ -46085,13 +46117,16 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v10, 16, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v8, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v8, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v9, 16, v2
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v11, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3
@@ -46099,9 +46134,9 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v11, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s15, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
@@ -46115,9 +46150,9 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s8 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s7 op_sel_hi:[0,1]
@@ -46146,14 +46181,14 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s3 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v54.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v29
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28
@@ -46170,16 +46205,13 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v53.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v65.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v67.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
@@ -46202,8 +46234,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s45
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s44
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s43
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s42
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s41
@@ -46260,8 +46292,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v26
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v48, 16, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v21, 16, v18
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v23, 16, v2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 462e50ac8412c..6c2d2b33a04b1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -4447,7 +4447,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9
-; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0
; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -4475,11 +4475,11 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -4534,16 +4534,18 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s0, 16
; GFX11-TRUE16-NEXT: .LBB13_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, s90
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s89
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s78
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, s88
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s79
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s78
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75
@@ -4584,10 +4586,10 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16
@@ -5632,8 +5634,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56
-; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -5660,8 +5662,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38
; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
-; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
+; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6
+; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6
; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6
; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6
; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6
@@ -5711,8 +5713,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr61
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
@@ -5802,8 +5804,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
-; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
+; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6
+; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6
; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6
; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6
; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6
@@ -9480,7 +9482,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9
-; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0
; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -9508,11 +9510,11 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3
; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -9567,16 +9569,18 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s0, 16
; GFX11-TRUE16-NEXT: .LBB17_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, s90
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s89
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s78
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, s88
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s79
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s78
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75
@@ -9617,10 +9621,10 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB17_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16
@@ -11338,15 +11342,15 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v12
; SI-NEXT: v_cvt_f16_f32_e32 v34, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v14
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
@@ -11439,9 +11443,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v49, v2
+; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_mov_b32_e32 v48, v3
+; SI-NEXT: v_mov_b32_e32 v49, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -11450,9 +11454,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v61, v44
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_mov_b32_e32 v39, v11
+; SI-NEXT: v_mov_b32_e32 v48, v11
; SI-NEXT: v_or_b32_e32 v2, v11, v2
-; SI-NEXT: v_mov_b32_e32 v33, v10
+; SI-NEXT: v_mov_b32_e32 v35, v10
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
; SI-NEXT: v_or_b32_e32 v5, v8, v5
@@ -11462,7 +11466,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
@@ -11477,9 +11481,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v40, v56
; SI-NEXT: v_or_b32_e32 v10, v56, v10
; SI-NEXT: v_or_b32_e32 v11, v45, v11
-; SI-NEXT: v_or_b32_e32 v12, v38, v12
-; SI-NEXT: v_or_b32_e32 v13, v36, v13
-; SI-NEXT: v_or_b32_e32 v14, v35, v14
+; SI-NEXT: v_or_b32_e32 v12, v33, v12
+; SI-NEXT: v_or_b32_e32 v13, v38, v13
+; SI-NEXT: v_or_b32_e32 v14, v36, v14
; SI-NEXT: v_or_b32_e32 v15, v32, v15
; SI-NEXT: v_or_b32_e32 v17, v37, v17
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
@@ -11541,10 +11545,10 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v48
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v35
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -11554,7 +11558,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v49
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
@@ -11572,9 +11576,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v36
; SI-NEXT: v_cvt_f32_f16_e32 v17, v32
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
@@ -11687,7 +11691,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v39
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -11827,10 +11831,10 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB19_4:
-; SI-NEXT: v_mov_b32_e32 v39, v11
-; SI-NEXT: v_mov_b32_e32 v33, v10
-; SI-NEXT: v_mov_b32_e32 v49, v2
-; SI-NEXT: v_mov_b32_e32 v48, v3
+; SI-NEXT: v_mov_b32_e32 v48, v11
+; SI-NEXT: v_mov_b32_e32 v35, v10
+; SI-NEXT: v_mov_b32_e32 v63, v2
+; SI-NEXT: v_mov_b32_e32 v49, v3
; SI-NEXT: v_mov_b32_e32 v52, v37
; SI-NEXT: v_mov_b32_e32 v37, v29
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -17248,8 +17252,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56
-; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -17276,8 +17280,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38
; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
-; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
+; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6
+; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6
; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6
; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6
; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6
@@ -17327,8 +17331,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr61
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
@@ -17418,8 +17422,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
-; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
+; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6
+; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6
; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6
; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6
; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6
@@ -20129,13 +20133,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: s_lshr_b32 s4, s9, 16
; SI-NEXT: v_cvt_f32_f16_e32 v40, s4
; SI-NEXT: s_lshr_b32 s4, s6, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v41, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
; SI-NEXT: s_lshr_b32 s4, s7, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v43, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
; SI-NEXT: s_lshr_b32 s4, s8, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v45, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
; SI-NEXT: s_lshr_b32 s4, s10, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v47, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
; SI-NEXT: s_lshr_b32 s4, s11, 16
; SI-NEXT: v_cvt_f32_f16_e32 v54, s4
; SI-NEXT: s_lshr_b32 s4, s12, 16
@@ -20210,83 +20214,98 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v11, s22
; SI-NEXT: v_cvt_f32_f16_e32 v9, s21
; SI-NEXT: v_cvt_f32_f16_e32 v7, s20
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s19
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s17
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v41, s19
+; SI-NEXT: v_cvt_f32_f16_e32 v43, s18
+; SI-NEXT: v_cvt_f32_f16_e32 v45, s17
+; SI-NEXT: v_cvt_f32_f16_e32 v47, s16
; SI-NEXT: s_cbranch_execnz .LBB33_3
; SI-NEXT: .LBB33_2: ; %cmp.true
; SI-NEXT: v_add_f32_e64 v1, s16, 1.0
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e64 v2, s17, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e64 v3, s18, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT: v_add_f32_e64 v14, s11, 1.0
; SI-NEXT: v_add_f32_e64 v36, s6, 1.0
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v54
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
; SI-NEXT: v_cvt_f32_f16_e32 v54, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e64 v10, s8, 1.0
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e64 v9, s21, 1.0
+; SI-NEXT: v_add_f32_e64 v15, s24, 1.0
+; SI-NEXT: v_add_f32_e64 v22, s27, 1.0
+; SI-NEXT: v_add_f32_e64 v29, s43, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22
; SI-NEXT: v_add_f32_e64 v26, s29, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29
+; SI-NEXT: v_add_f32_e64 v10, s8, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_cvt_f32_f16_e32 v58, v10
; SI-NEXT: v_cvt_f32_f16_e32 v10, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v44
+; SI-NEXT: v_add_f32_e64 v48, s9, 1.0
; SI-NEXT: v_add_f32_e64 v12, s10, 1.0
-; SI-NEXT: v_add_f32_e64 v33, s7, 1.0
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT: v_add_f32_e64 v3, s18, 1.0
-; SI-NEXT: v_add_f32_e64 v5, s19, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v50
; SI-NEXT: v_add_f32_e64 v7, s20, 1.0
-; SI-NEXT: v_add_f32_e64 v9, s21, 1.0
-; SI-NEXT: v_add_f32_e64 v11, s22, 1.0
; SI-NEXT: v_add_f32_e64 v13, s23, 1.0
-; SI-NEXT: v_add_f32_e64 v15, s24, 1.0
-; SI-NEXT: v_add_f32_e64 v18, s25, 1.0
; SI-NEXT: v_add_f32_e64 v20, s26, 1.0
-; SI-NEXT: v_add_f32_e64 v22, s27, 1.0
-; SI-NEXT: v_add_f32_e64 v24, s28, 1.0
-; SI-NEXT: v_add_f32_e64 v29, s43, 1.0
-; SI-NEXT: v_add_f32_e64 v27, s42, 1.0
; SI-NEXT: v_add_f32_e64 v25, s41, 1.0
; SI-NEXT: v_add_f32_e64 v23, s40, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v40
+; SI-NEXT: v_mov_b32_e32 v40, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v46
+; SI-NEXT: v_add_f32_e64 v5, s19, 1.0
+; SI-NEXT: v_add_f32_e64 v11, s22, 1.0
+; SI-NEXT: v_add_f32_e64 v18, s25, 1.0
+; SI-NEXT: v_add_f32_e64 v24, s28, 1.0
+; SI-NEXT: v_add_f32_e64 v27, s42, 1.0
; SI-NEXT: v_add_f32_e64 v21, s15, 1.0
; SI-NEXT: v_add_f32_e64 v19, s14, 1.0
; SI-NEXT: v_add_f32_e64 v17, s13, 1.0
; SI-NEXT: v_add_f32_e64 v16, s12, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33
-; SI-NEXT: v_add_f32_e64 v48, s9, 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23
+; SI-NEXT: v_add_f32_e64 v33, s7, 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v27
; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33
; SI-NEXT: v_cvt_f32_f16_e32 v38, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v36
; SI-NEXT: v_cvt_f32_f16_e32 v57, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v12
; SI-NEXT: v_cvt_f32_f16_e32 v60, v14
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
@@ -20297,50 +20316,40 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v33, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v20
; SI-NEXT: v_cvt_f32_f16_e32 v14, v18
; SI-NEXT: v_cvt_f32_f16_e32 v48, v15
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v8
; SI-NEXT: v_cvt_f32_f16_e32 v50, v4
; SI-NEXT: v_cvt_f32_f16_e32 v4, v63
; SI-NEXT: v_cvt_f32_f16_e32 v15, v62
; SI-NEXT: v_cvt_f32_f16_e32 v18, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v45
-; SI-NEXT: v_mov_b32_e32 v45, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v55
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v42
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v56
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v56, v6
-; SI-NEXT: v_mov_b32_e32 v47, v8
-; SI-NEXT: v_mov_b32_e32 v43, v34
; SI-NEXT: .LBB33_3: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v6, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v47
; SI-NEXT: v_cvt_f16_f32_e32 v31, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v45
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v8, v6
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31
@@ -20351,14 +20360,14 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen
; SI-NEXT: v_cvt_f16_f32_e32 v6, v44
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v43
; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v8, v6
; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v41
; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v8, v6
@@ -20497,26 +20506,26 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v1
; SI-NEXT: v_cvt_f16_f32_e32 v6, v59
; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v45
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v58
; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v57
; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v52
@@ -20550,13 +20559,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB33_4:
-; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr55
@@ -20597,13 +20606,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: s_branch .LBB33_2
@@ -22944,15 +22953,15 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v12
; SI-NEXT: v_cvt_f16_f32_e32 v34, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v14
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
@@ -23045,9 +23054,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v49, v2
+; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_mov_b32_e32 v48, v3
+; SI-NEXT: v_mov_b32_e32 v49, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -23056,9 +23065,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_mov_b32_e32 v61, v44
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_mov_b32_e32 v39, v11
+; SI-NEXT: v_mov_b32_e32 v48, v11
; SI-NEXT: v_or_b32_e32 v2, v11, v2
-; SI-NEXT: v_mov_b32_e32 v33, v10
+; SI-NEXT: v_mov_b32_e32 v35, v10
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
; SI-NEXT: v_or_b32_e32 v5, v8, v5
@@ -23068,7 +23077,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
@@ -23083,9 +23092,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_mov_b32_e32 v40, v56
; SI-NEXT: v_or_b32_e32 v10, v56, v10
; SI-NEXT: v_or_b32_e32 v11, v45, v11
-; SI-NEXT: v_or_b32_e32 v12, v38, v12
-; SI-NEXT: v_or_b32_e32 v13, v36, v13
-; SI-NEXT: v_or_b32_e32 v14, v35, v14
+; SI-NEXT: v_or_b32_e32 v12, v33, v12
+; SI-NEXT: v_or_b32_e32 v13, v38, v13
+; SI-NEXT: v_or_b32_e32 v14, v36, v14
; SI-NEXT: v_or_b32_e32 v15, v32, v15
; SI-NEXT: v_or_b32_e32 v17, v37, v17
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
@@ -23147,10 +23156,10 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v48
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v35
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -23160,7 +23169,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v49
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
@@ -23178,9 +23187,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v36
; SI-NEXT: v_cvt_f32_f16_e32 v17, v32
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
@@ -23293,7 +23302,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v39
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -23433,10 +23442,10 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB35_4:
-; SI-NEXT: v_mov_b32_e32 v39, v11
-; SI-NEXT: v_mov_b32_e32 v33, v10
-; SI-NEXT: v_mov_b32_e32 v49, v2
-; SI-NEXT: v_mov_b32_e32 v48, v3
+; SI-NEXT: v_mov_b32_e32 v48, v11
+; SI-NEXT: v_mov_b32_e32 v35, v10
+; SI-NEXT: v_mov_b32_e32 v63, v2
+; SI-NEXT: v_mov_b32_e32 v49, v3
; SI-NEXT: v_mov_b32_e32 v52, v37
; SI-NEXT: v_mov_b32_e32 v37, v29
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -26820,7 +26829,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9
-; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0
; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -26848,11 +26857,11 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3
; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -26907,16 +26916,18 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s0, 16
; GFX11-TRUE16-NEXT: .LBB41_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, s90
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s89
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s78
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, s88
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s79
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s78
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75
@@ -26957,10 +26968,10 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB41_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16
@@ -28005,8 +28016,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56
-; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -28033,8 +28044,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38
; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
-; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
+; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6
+; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6
; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6
; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6
; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6
@@ -28084,8 +28095,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr61
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
@@ -28175,8 +28186,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
-; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
+; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6
+; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6
; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6
; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6
; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6
@@ -31867,7 +31878,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9
-; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0
; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -31895,11 +31906,11 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3
; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -31954,16 +31965,18 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s0, 16
; GFX11-TRUE16-NEXT: .LBB45_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, s90
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s89
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s78
+; GFX11-TRUE16-NEXT: s_mov_b32 s78, s88
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s79
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s78
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75
@@ -32004,10 +32017,10 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB45_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16
@@ -33725,15 +33738,15 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v12
; SI-NEXT: v_cvt_f16_f32_e32 v34, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v14
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
@@ -33826,9 +33839,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v49, v2
+; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_mov_b32_e32 v48, v3
+; SI-NEXT: v_mov_b32_e32 v49, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -33837,9 +33850,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v61, v44
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_mov_b32_e32 v39, v11
+; SI-NEXT: v_mov_b32_e32 v48, v11
; SI-NEXT: v_or_b32_e32 v2, v11, v2
-; SI-NEXT: v_mov_b32_e32 v33, v10
+; SI-NEXT: v_mov_b32_e32 v35, v10
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
; SI-NEXT: v_or_b32_e32 v5, v8, v5
@@ -33849,7 +33862,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
@@ -33864,9 +33877,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v40, v56
; SI-NEXT: v_or_b32_e32 v10, v56, v10
; SI-NEXT: v_or_b32_e32 v11, v45, v11
-; SI-NEXT: v_or_b32_e32 v12, v38, v12
-; SI-NEXT: v_or_b32_e32 v13, v36, v13
-; SI-NEXT: v_or_b32_e32 v14, v35, v14
+; SI-NEXT: v_or_b32_e32 v12, v33, v12
+; SI-NEXT: v_or_b32_e32 v13, v38, v13
+; SI-NEXT: v_or_b32_e32 v14, v36, v14
; SI-NEXT: v_or_b32_e32 v15, v32, v15
; SI-NEXT: v_or_b32_e32 v17, v37, v17
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
@@ -33928,10 +33941,10 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v48
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v35
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -33941,7 +33954,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v49
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
@@ -33959,9 +33972,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v36
; SI-NEXT: v_cvt_f32_f16_e32 v17, v32
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
@@ -34074,7 +34087,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v39
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -34214,10 +34227,10 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
-; SI-NEXT: v_mov_b32_e32 v39, v11
-; SI-NEXT: v_mov_b32_e32 v33, v10
-; SI-NEXT: v_mov_b32_e32 v49, v2
-; SI-NEXT: v_mov_b32_e32 v48, v3
+; SI-NEXT: v_mov_b32_e32 v48, v11
+; SI-NEXT: v_mov_b32_e32 v35, v10
+; SI-NEXT: v_mov_b32_e32 v63, v2
+; SI-NEXT: v_mov_b32_e32 v49, v3
; SI-NEXT: v_mov_b32_e32 v52, v37
; SI-NEXT: v_mov_b32_e32 v37, v29
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -37812,8 +37825,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56
-; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -37840,8 +37853,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38
; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
-; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
+; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6
+; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6
; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6
; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6
; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6
@@ -37891,8 +37904,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr61
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
@@ -37982,8 +37995,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
-; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
+; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6
+; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6
; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6
; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6
; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6
@@ -39530,18 +39543,18 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; kill: killed $vgpr29
; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -39559,13 +39572,13 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v62, v29
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v60, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v59, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v57, v29
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v29
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
@@ -39590,8 +39603,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v27
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15
@@ -39632,7 +39645,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v55, v29
; SI-NEXT: v_mov_b32_e32 v29, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v26
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -39676,41 +39689,35 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB52_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v47
-; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
; SI-NEXT: v_add_f64 v[54:55], v[1:2], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v45
-; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
+; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v43
-; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v45
+; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v42
-; SI-NEXT: v_mov_b32_e32 v42, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v43
; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
+; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55
@@ -39726,9 +39733,11 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v23
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26
@@ -39769,6 +39778,10 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v60, v60
; SI-NEXT: v_cvt_f32_f16_e32 v59, v59
; SI-NEXT: v_cvt_f32_f16_e32 v57, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v47
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v42
; SI-NEXT: v_cvt_f32_f16_e32 v44, v44
; SI-NEXT: v_cvt_f32_f16_e32 v46, v46
; SI-NEXT: v_cvt_f32_f16_e32 v56, v56
@@ -39783,11 +39796,11 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
; SI-NEXT: v_cvt_f32_f16_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v47, v26
-; SI-NEXT: v_mov_b32_e32 v45, v27
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v55, v1
-; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: v_mov_b32_e32 v45, v26
+; SI-NEXT: v_mov_b32_e32 v43, v27
+; SI-NEXT: v_mov_b32_e32 v42, v28
; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -39805,7 +39818,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: .LBB52_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
@@ -39972,7 +39985,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v47
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -39981,7 +39994,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -39990,7 +40003,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v59
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -39999,7 +40012,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -40026,21 +40039,21 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v43
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -40638,23 +40651,24 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: s_cbranch_scc0 .LBB53_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s42, s5, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v29, s42
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_cvt_f32_f16_e32 v60, s42
; SI-NEXT: s_lshr_b32 s42, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v25, s42
-; SI-NEXT: s_lshr_b32 s42, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v21, s42
+; SI-NEXT: s_lshr_b32 s42, s7, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s42
; SI-NEXT: s_lshr_b32 s42, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s42
; SI-NEXT: s_lshr_b32 s42, s9, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v17, s42
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s42
; SI-NEXT: s_lshr_b32 s42, s8, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s42
+; SI-NEXT: v_cvt_f32_f16_e32 v9, s42
; SI-NEXT: s_lshr_b32 s42, s11, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s42
+; SI-NEXT: v_cvt_f32_f16_e32 v58, s42
; SI-NEXT: s_lshr_b32 s42, s10, 16
; SI-NEXT: v_cvt_f32_f16_e32 v13, s42
; SI-NEXT: s_lshr_b32 s42, s13, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v48, s42
+; SI-NEXT: v_cvt_f32_f16_e32 v59, s42
; SI-NEXT: s_lshr_b32 s42, s12, 16
; SI-NEXT: v_cvt_f32_f16_e32 v18, s42
; SI-NEXT: s_lshr_b32 s42, s15, 16
@@ -40693,16 +40707,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v46, s42
; SI-NEXT: s_lshr_b32 s42, s16, 16
; SI-NEXT: v_cvt_f32_f16_e32 v56, s42
-; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_cvt_f32_f16_e32 v57, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v33, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v34, s7
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_cvt_f32_f16_e32 v58, s6
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_cvt_f32_f16_e32 v59, s9
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v60, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v25, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v29, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v33, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v34, s9
+; SI-NEXT: v_cvt_f32_f16_e32 v48, s8
; SI-NEXT: v_cvt_f32_f16_e32 v16, s11
; SI-NEXT: v_cvt_f32_f16_e32 v7, s10
; SI-NEXT: v_cvt_f32_f16_e32 v19, s13
@@ -40739,14 +40749,23 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v43
-; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0
+; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v57, v4
; SI-NEXT: v_cvt_f32_f16_e32 v4, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v59
+; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0
+; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v60
+; SI-NEXT: v_mov_b32_e32 v60, v22
; SI-NEXT: v_cvt_f32_f16_e32 v22, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v41, v43
@@ -40754,13 +40773,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: v_add_f64 v[49:50], s[22:23], 1.0
; SI-NEXT: v_add_f64 v[37:38], s[24:25], 1.0
; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0
-; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0
; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v49
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v7
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v7
; SI-NEXT: v_cvt_f32_f16_e32 v7, v15
; SI-NEXT: v_cvt_f32_f16_e32 v15, v37
; SI-NEXT: v_cvt_f32_f16_e32 v37, v39
@@ -40774,7 +40792,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v53, v46
; SI-NEXT: v_add_f64 v[30:31], s[28:29], 1.0
; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0
-; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0
; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v50
; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v35
@@ -40784,20 +40801,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26
; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27
; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v60, v11
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v11
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v18
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v11, v26
@@ -40811,16 +40824,14 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
; SI-NEXT: v_cvt_f32_f16_e32 v45, v2
; SI-NEXT: v_cvt_f32_f16_e32 v47, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v14
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v58
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v18, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v6
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v26, v63
@@ -40828,11 +40839,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v30, v62
; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
; SI-NEXT: v_cvt_f32_f16_e32 v35, v61
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v56
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v42, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v55
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v56
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v44, v5
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
@@ -40844,193 +40855,193 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: .LBB53_3: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v5, v56
; SI-NEXT: v_cvt_f16_f32_e32 v6, v47
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v45
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v45
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14
; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; SI-NEXT: v_or_b32_e32 v6, v14, v6
+; SI-NEXT: v_or_b32_e32 v6, v17, v6
; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen
; SI-NEXT: v_cvt_f16_f32_e32 v5, v44
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v43
-; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v42
; SI-NEXT: v_cvt_f16_f32_e32 v6, v41
-; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v55
; SI-NEXT: v_cvt_f16_f32_e32 v6, v40
-; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v53
; SI-NEXT: v_cvt_f16_f32_e32 v6, v54
-; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v51
; SI-NEXT: v_cvt_f16_f32_e32 v6, v52
-; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v49
; SI-NEXT: v_cvt_f16_f32_e32 v6, v50
-; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v39
; SI-NEXT: v_cvt_f16_f32_e32 v6, v15
-; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v37
; SI-NEXT: v_cvt_f16_f32_e32 v6, v38
-; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v35
; SI-NEXT: v_cvt_f16_f32_e32 v6, v36
-; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v32
; SI-NEXT: v_cvt_f16_f32_e32 v6, v3
-; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v30
; SI-NEXT: v_cvt_f16_f32_e32 v6, v12
-; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v28
; SI-NEXT: v_cvt_f16_f32_e32 v6, v31
-; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v26
; SI-NEXT: v_cvt_f16_f32_e32 v6, v11
-; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v24
; SI-NEXT: v_cvt_f16_f32_e32 v6, v27
-; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0
+; SI-NEXT: v_add_i32_e32 v14, vcc, 60, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v4
+; SI-NEXT: v_add_i32_e32 v14, vcc, 64, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v20
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v23
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v23
+; SI-NEXT: v_add_i32_e32 v14, vcc, 0x44, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v8
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
+; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v19
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v19
+; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
+; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v7
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
+; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v60
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v58
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v16
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
+; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v48
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v59
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v34
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v58
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v33
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v1, v3, v1
+; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v33
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v25
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
; SI-NEXT: v_cvt_f16_f32_e32 v2, v57
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -41094,23 +41105,23 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: s_branch .LBB53_2
;
; VI-LABEL: bitcast_v14f64_to_v56f16_scalar:
@@ -43419,15 +43430,15 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v10
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v12
; SI-NEXT: v_cvt_f16_f32_e32 v34, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v14
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
@@ -43520,9 +43531,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v49, v2
+; SI-NEXT: v_mov_b32_e32 v63, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_mov_b32_e32 v48, v3
+; SI-NEXT: v_mov_b32_e32 v49, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -43531,9 +43542,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_mov_b32_e32 v61, v44
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_mov_b32_e32 v39, v11
+; SI-NEXT: v_mov_b32_e32 v48, v11
; SI-NEXT: v_or_b32_e32 v2, v11, v2
-; SI-NEXT: v_mov_b32_e32 v33, v10
+; SI-NEXT: v_mov_b32_e32 v35, v10
; SI-NEXT: v_or_b32_e32 v3, v10, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
; SI-NEXT: v_or_b32_e32 v5, v8, v5
@@ -43543,7 +43554,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
@@ -43558,9 +43569,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_mov_b32_e32 v40, v56
; SI-NEXT: v_or_b32_e32 v10, v56, v10
; SI-NEXT: v_or_b32_e32 v11, v45, v11
-; SI-NEXT: v_or_b32_e32 v12, v38, v12
-; SI-NEXT: v_or_b32_e32 v13, v36, v13
-; SI-NEXT: v_or_b32_e32 v14, v35, v14
+; SI-NEXT: v_or_b32_e32 v12, v33, v12
+; SI-NEXT: v_or_b32_e32 v13, v38, v13
+; SI-NEXT: v_or_b32_e32 v14, v36, v14
; SI-NEXT: v_or_b32_e32 v15, v32, v15
; SI-NEXT: v_or_b32_e32 v17, v37, v17
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
@@ -43622,10 +43633,10 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v48
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v35
; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -43635,7 +43646,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v49
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
@@ -43653,9 +43664,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v36
; SI-NEXT: v_cvt_f32_f16_e32 v17, v32
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
@@ -43768,7 +43779,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v39
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
@@ -43908,10 +43919,10 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB55_4:
-; SI-NEXT: v_mov_b32_e32 v39, v11
-; SI-NEXT: v_mov_b32_e32 v33, v10
-; SI-NEXT: v_mov_b32_e32 v49, v2
-; SI-NEXT: v_mov_b32_e32 v48, v3
+; SI-NEXT: v_mov_b32_e32 v48, v11
+; SI-NEXT: v_mov_b32_e32 v35, v10
+; SI-NEXT: v_mov_b32_e32 v63, v2
+; SI-NEXT: v_mov_b32_e32 v49, v3
; SI-NEXT: v_mov_b32_e32 v52, v37
; SI-NEXT: v_mov_b32_e32 v37, v29
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -46178,9 +46189,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(4)
@@ -46226,7 +46236,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_cbranch_scc0 .LBB57_2
@@ -46252,33 +46262,35 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v33, s21
; SI-NEXT: v_cvt_f32_f16_e32 v54, v29
; SI-NEXT: v_cvt_f32_f16_e32 v41, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v37
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v56
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v35
-; SI-NEXT: v_mov_b32_e32 v47, v34
+; SI-NEXT: v_mov_b32_e32 v57, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v36
+; SI-NEXT: v_mov_b32_e32 v56, v35
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, s23
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v35
+; SI-NEXT: v_mov_b32_e32 v47, v34
; SI-NEXT: v_cvt_f32_f16_e32 v53, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v31
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, s24
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v31
; SI-NEXT: v_cvt_f32_f16_e32 v36, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v62
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v61
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, s25
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v61
; SI-NEXT: v_cvt_f32_f16_e32 v37, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v58
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, s26
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v58
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, s27
@@ -46362,6 +46374,12 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: .LBB57_2:
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: v_mov_b32_e32 v57, v36
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: v_mov_b32_e32 v56, v35
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: v_mov_b32_e32 v47, v34
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
@@ -46449,10 +46467,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; kill: killed $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: .LBB57_3: ; %Flow
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
@@ -46470,6 +46484,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v47
+; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v56
+; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v57
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: v_cvt_f32_f16_e32 v33, s16
; SI-NEXT: s_add_i32 s18, s18, 3
@@ -46512,40 +46528,38 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, s25
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
-; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56
+; SI-NEXT: s_add_i32 s19, s19, 3
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v33, s26
-; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57
-; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v33, s27
; SI-NEXT: s_add_i32 s17, s17, 3
; SI-NEXT: v_cvt_f32_f16_e32 v38, s17
; SI-NEXT: v_cvt_f32_f16_e32 v39, s19
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v33, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v33, s27
; SI-NEXT: v_cvt_f32_f16_e32 v55, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v47
; SI-NEXT: v_cvt_f32_f16_e32 v53, v46
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v33, s29
+; SI-NEXT: v_cvt_f32_f16_e32 v33, s28
; SI-NEXT: v_cvt_f32_f16_e32 v36, v32
; SI-NEXT: v_cvt_f32_f16_e32 v51, v31
; SI-NEXT: v_cvt_f32_f16_e32 v34, v63
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v62
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v61
+; SI-NEXT: v_cvt_f32_f16_e32 v33, s29
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v62
; SI-NEXT: v_cvt_f32_f16_e32 v37, v60
; SI-NEXT: v_cvt_f32_f16_e32 v35, v58
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v61
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v1
+; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v57
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
@@ -47480,26 +47494,31 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v13, 16, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v13, 16, v5
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v10, 16, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v11, 16, v2
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v10, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v12, 16, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v13, 16, v0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21
; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v13, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s13, s9
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s10, 3 op_sel_hi:[1,0]
@@ -47513,11 +47532,11 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s8, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s7, 3 op_sel_hi:[1,0]
@@ -47546,16 +47565,16 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s3, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v30
@@ -47574,16 +47593,11 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v67.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v69.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
@@ -50146,26 +50160,31 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v13, 16, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v13, 16, v5
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v10, 16, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v11, 16, v2
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v10, 16, v3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v12, 16, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v13, 16, v0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21
; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v13, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s13, s9
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s10 op_sel_hi:[0,1]
@@ -50179,11 +50198,11 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s8 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s7 op_sel_hi:[0,1]
@@ -50212,16 +50231,16 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s3 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v30
@@ -50240,16 +50259,11 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v67.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v69.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 94ed6276bd051..a5bfff7b0e5d3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -4738,7 +4738,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11
-; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, 0
; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -4767,12 +4767,12 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s89
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -4830,19 +4830,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s0, 16
; GFX11-TRUE16-NEXT: .LBB13_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s94
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s93
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s92
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s91
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s90
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77
@@ -4884,11 +4887,11 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB13_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
@@ -6006,8 +6009,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46
-; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59
@@ -6042,8 +6045,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6
; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6
-; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6
+; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6
+; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6
; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6
; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
@@ -6085,8 +6088,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr32
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr62
@@ -6200,8 +6203,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6
; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6
-; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6
+; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6
+; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6
; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6
; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6
; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
@@ -7748,39 +7751,43 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
@@ -7803,10 +7810,12 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr38
@@ -7816,20 +7825,18 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -7849,18 +7856,14 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB16_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28
; SI-NEXT: v_cvt_f32_f16_e32 v55, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27
@@ -7870,52 +7873,57 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v49, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12
; SI-NEXT: v_cvt_f32_f16_e32 v33, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v36, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10
; SI-NEXT: v_cvt_f32_f16_e32 v38, v32
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
; SI-NEXT: v_cvt_f32_f16_e32 v48, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v50, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v32
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v35
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v58, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
@@ -7923,9 +7931,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14
; SI-NEXT: v_cvt_f32_f16_e32 v62, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
@@ -7941,43 +7946,42 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v63, v25
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v26
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
; SI-NEXT: v_cvt_f32_f16_e32 v44, v3
; SI-NEXT: v_cvt_f32_f16_e32 v46, v2
; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
@@ -8015,37 +8019,61 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB16_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v46, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v37
+; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v35
; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v34
; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v32
; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19
-; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24
; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v37
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v39
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
+; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v61
+; SI-NEXT: v_mov_b32_e32 v61, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v47
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9
@@ -8056,20 +8084,15 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14
; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15
; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
-; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
-; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
-; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9
; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10
@@ -8079,15 +8102,12 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
@@ -8107,27 +8127,24 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
; SI-NEXT: v_cvt_f32_f16_e32 v44, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
; SI-NEXT: v_cvt_f32_f16_e32 v55, v55
; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v61
-; SI-NEXT: v_mov_b32_e32 v61, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
; SI-NEXT: v_cvt_f32_f16_e32 v58, v58
; SI-NEXT: v_cvt_f32_f16_e32 v60, v60
; SI-NEXT: v_cvt_f32_f16_e32 v62, v62
@@ -8137,46 +8154,41 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v52
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v45
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v57
-; SI-NEXT: v_mov_b32_e32 v37, v27
-; SI-NEXT: v_mov_b32_e32 v35, v28
-; SI-NEXT: v_mov_b32_e32 v34, v29
-; SI-NEXT: v_mov_b32_e32 v32, v30
-; SI-NEXT: v_mov_b32_e32 v63, v25
-; SI-NEXT: v_mov_b32_e32 v59, v26
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v45
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v57
+; SI-NEXT: v_mov_b32_e32 v39, v28
+; SI-NEXT: v_mov_b32_e32 v37, v29
+; SI-NEXT: v_mov_b32_e32 v35, v30
+; SI-NEXT: v_mov_b32_e32 v63, v26
+; SI-NEXT: v_mov_b32_e32 v59, v27
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v1
; SI-NEXT: .LBB16_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
; SI-NEXT: v_cvt_f16_f32_e32 v3, v46
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -8195,43 +8207,39 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v32
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
@@ -8240,7 +8248,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
@@ -8249,7 +8257,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
@@ -8258,7 +8266,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
@@ -8267,7 +8275,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
@@ -8276,7 +8284,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
@@ -8285,7 +8293,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v62
; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
@@ -8294,7 +8302,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -8303,7 +8311,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v58
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
@@ -8313,8 +8321,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -8324,8 +8332,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -8335,8 +8343,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -8346,8 +8354,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -8357,8 +8365,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -8368,8 +8376,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -8378,62 +8386,66 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v63
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v63
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v59
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v39
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v35
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -10177,7 +10189,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11
-; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, 0
; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -10206,12 +10218,12 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s89
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3
; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
@@ -10269,19 +10281,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s0, 16
; GFX11-TRUE16-NEXT: .LBB17_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s94
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s93
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s92
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s91
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s90
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77
@@ -10323,11 +10338,11 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB17_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
@@ -12219,9 +12234,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v10, s20
; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
; SI-NEXT: v_cvt_f16_f32_e32 v5, s27
@@ -12300,9 +12315,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v10, v2
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
@@ -12330,13 +12345,13 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
-; SI-NEXT: v_or_b32_e32 v2, v11, v2
+; SI-NEXT: v_or_b32_e32 v3, v11, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
@@ -12501,7 +12516,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -12523,7 +12538,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: s_waitcnt vmcnt(11)
@@ -18557,8 +18572,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46
-; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59
@@ -18593,8 +18608,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6
; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6
-; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6
+; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6
+; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6
; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6
; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
@@ -18636,8 +18651,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr32
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr62
@@ -18751,8 +18766,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6
; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6
-; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6
+; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6
+; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6
; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6
; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6
; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
@@ -20299,39 +20314,43 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
@@ -20354,10 +20373,12 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr38
@@ -20367,20 +20388,18 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -20400,18 +20419,14 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB32_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28
; SI-NEXT: v_cvt_f32_f16_e32 v55, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27
@@ -20421,52 +20436,57 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v49, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12
; SI-NEXT: v_cvt_f32_f16_e32 v33, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v36, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10
; SI-NEXT: v_cvt_f32_f16_e32 v38, v32
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
; SI-NEXT: v_cvt_f32_f16_e32 v48, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v50, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v32
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v35
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v58, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
@@ -20474,9 +20494,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14
; SI-NEXT: v_cvt_f32_f16_e32 v62, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
@@ -20492,43 +20509,42 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v63, v25
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v26
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
; SI-NEXT: v_cvt_f32_f16_e32 v44, v3
; SI-NEXT: v_cvt_f32_f16_e32 v46, v2
; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
@@ -20566,37 +20582,61 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB32_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_add_f32_e32 v23, 1.0, v23
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v46, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v37
+; SI-NEXT: v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22
; SI-NEXT: v_add_f32_e32 v21, 1.0, v21
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v35
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21
; SI-NEXT: v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v34
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20
; SI-NEXT: v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT: v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v24, 1.0, v24
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v32
-; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24
; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v37
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v39
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT: v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT: v_add_f32_e32 v18, 1.0, v18
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v61
+; SI-NEXT: v_mov_b32_e32 v61, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_add_f32_e32 v5, 1.0, v5
; SI-NEXT: v_add_f32_e32 v6, 1.0, v6
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v47
+; SI-NEXT: v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_add_f32_e32 v5, 1.0, v5
; SI-NEXT: v_add_f32_e32 v7, 1.0, v7
; SI-NEXT: v_add_f32_e32 v8, 1.0, v8
; SI-NEXT: v_add_f32_e32 v9, 1.0, v9
@@ -20607,20 +20647,15 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v14, 1.0, v14
; SI-NEXT: v_add_f32_e32 v15, 1.0, v15
; SI-NEXT: v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT: v_add_f32_e32 v24, 1.0, v24
; SI-NEXT: v_add_f32_e32 v25, 1.0, v25
; SI-NEXT: v_add_f32_e32 v26, 1.0, v26
; SI-NEXT: v_add_f32_e32 v27, 1.0, v27
; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
; SI-NEXT: v_add_f32_e32 v30, 1.0, v30
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9
; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10
@@ -20630,15 +20665,12 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
@@ -20664,70 +20696,62 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
; SI-NEXT: v_cvt_f32_f16_e32 v44, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
; SI-NEXT: v_cvt_f32_f16_e32 v55, v55
; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v61
-; SI-NEXT: v_mov_b32_e32 v61, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
; SI-NEXT: v_cvt_f32_f16_e32 v58, v58
; SI-NEXT: v_cvt_f32_f16_e32 v60, v60
; SI-NEXT: v_cvt_f32_f16_e32 v62, v62
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v36, v36
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v52
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v45
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v57
-; SI-NEXT: v_mov_b32_e32 v37, v27
-; SI-NEXT: v_mov_b32_e32 v35, v28
-; SI-NEXT: v_mov_b32_e32 v34, v29
-; SI-NEXT: v_mov_b32_e32 v32, v30
-; SI-NEXT: v_mov_b32_e32 v63, v25
-; SI-NEXT: v_mov_b32_e32 v59, v26
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v45
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v57
+; SI-NEXT: v_mov_b32_e32 v39, v28
+; SI-NEXT: v_mov_b32_e32 v37, v29
+; SI-NEXT: v_mov_b32_e32 v35, v30
+; SI-NEXT: v_mov_b32_e32 v63, v26
+; SI-NEXT: v_mov_b32_e32 v59, v27
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v1
; SI-NEXT: .LBB32_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
; SI-NEXT: v_cvt_f16_f32_e32 v3, v46
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -20746,43 +20770,39 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v32
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
@@ -20791,7 +20811,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
@@ -20800,7 +20820,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
@@ -20809,7 +20829,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
@@ -20818,7 +20838,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
@@ -20827,7 +20847,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
@@ -20836,7 +20856,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v62
; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
@@ -20845,7 +20865,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -20854,7 +20874,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v58
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
@@ -20864,8 +20884,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -20875,8 +20895,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -20886,8 +20906,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -20897,8 +20917,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -20908,8 +20928,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -20919,8 +20939,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -20929,62 +20949,66 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v63
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v63
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v59
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v39
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v35
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -24752,9 +24776,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v10, s20
; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
; SI-NEXT: v_cvt_f16_f32_e32 v5, s27
@@ -24833,9 +24857,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v10, v2
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
@@ -24863,13 +24887,13 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
-; SI-NEXT: v_or_b32_e32 v2, v11, v2
+; SI-NEXT: v_or_b32_e32 v3, v11, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
@@ -25034,7 +25058,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -25056,7 +25080,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: s_waitcnt vmcnt(11)
@@ -28942,7 +28966,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11
-; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, 0
; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -28971,12 +28995,12 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s89
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3
; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -29034,19 +29058,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s0, 16
; GFX11-TRUE16-NEXT: .LBB41_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s94
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s93
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s92
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s91
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s90
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77
@@ -29088,11 +29115,11 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB41_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
@@ -30210,8 +30237,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46
-; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59
@@ -30246,8 +30273,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6
; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6
-; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6
+; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6
+; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6
; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6
; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
@@ -30289,8 +30316,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr32
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr62
@@ -30404,8 +30431,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6
; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6
-; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6
+; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6
+; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6
; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6
; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6
; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
@@ -31952,39 +31979,43 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
@@ -32007,9 +32038,11 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr48
@@ -32020,20 +32053,18 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -32053,18 +32084,14 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB44_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28
; SI-NEXT: v_cvt_f32_f16_e32 v55, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27
@@ -32074,52 +32101,57 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v49, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12
; SI-NEXT: v_cvt_f32_f16_e32 v33, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v36, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10
; SI-NEXT: v_cvt_f32_f16_e32 v38, v32
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
; SI-NEXT: v_cvt_f32_f16_e32 v48, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v50, v32
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
; SI-NEXT: v_cvt_f32_f16_e32 v52, v32
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v58, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
@@ -32127,11 +32159,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14
; SI-NEXT: v_cvt_f32_f16_e32 v62, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
@@ -32145,43 +32172,44 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v29
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v63, v25
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v26
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
; SI-NEXT: v_cvt_f32_f16_e32 v44, v3
; SI-NEXT: v_cvt_f32_f16_e32 v46, v2
; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
@@ -32222,9 +32250,9 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
@@ -32242,38 +32270,57 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21
; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2
+; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v46, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v37
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22
; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21
; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v35
; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
-; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v32
-; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
-; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1
-; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v34
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24
; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v39
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v32
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v63
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v43
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v61
+; SI-NEXT: v_mov_b32_e32 v61, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v47
+; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc
+; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
+; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5
; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8
; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9
@@ -32284,15 +32331,12 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
@@ -32318,21 +32362,15 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v4
; SI-NEXT: v_cvt_f32_f16_e32 v44, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
; SI-NEXT: v_cvt_f32_f16_e32 v55, v55
; SI-NEXT: v_cvt_f32_f16_e32 v53, v53
; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v61
-; SI-NEXT: v_mov_b32_e32 v61, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v59
; SI-NEXT: v_cvt_f32_f16_e32 v58, v58
; SI-NEXT: v_cvt_f32_f16_e32 v60, v60
; SI-NEXT: v_cvt_f32_f16_e32 v62, v62
@@ -32343,45 +32381,43 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_cvt_f32_f16_e32 v52, v52
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v45
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v47
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v57
-; SI-NEXT: v_mov_b32_e32 v37, v27
-; SI-NEXT: v_mov_b32_e32 v35, v28
-; SI-NEXT: v_mov_b32_e32 v34, v29
-; SI-NEXT: v_mov_b32_e32 v32, v30
-; SI-NEXT: v_mov_b32_e32 v63, v25
-; SI-NEXT: v_mov_b32_e32 v59, v26
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v45
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v57
+; SI-NEXT: v_mov_b32_e32 v39, v28
+; SI-NEXT: v_mov_b32_e32 v37, v29
+; SI-NEXT: v_mov_b32_e32 v35, v30
+; SI-NEXT: v_mov_b32_e32 v63, v26
+; SI-NEXT: v_mov_b32_e32 v59, v27
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v1
; SI-NEXT: .LBB44_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v57
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v56
; SI-NEXT: v_cvt_f16_f32_e32 v3, v46
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -32400,34 +32436,30 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v32
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
@@ -32436,7 +32468,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
@@ -32445,7 +32477,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
@@ -32454,7 +32486,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v38
; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
@@ -32463,7 +32495,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
@@ -32472,7 +32504,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
@@ -32481,7 +32513,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
@@ -32490,7 +32522,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v62
; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
@@ -32499,7 +32531,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
@@ -32508,7 +32540,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v58
; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
@@ -32518,8 +32550,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32529,8 +32561,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32540,8 +32572,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32551,8 +32583,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32562,8 +32594,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32573,8 +32605,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -32583,62 +32615,66 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v63
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v63
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v53
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v59
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v39
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v37
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v35
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -34398,7 +34434,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11
-; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, 0
; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
@@ -34427,12 +34463,12 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s89
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3
; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3
@@ -34490,19 +34526,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s0, 16
; GFX11-TRUE16-NEXT: .LBB45_3: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s94
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s93
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s92
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s91
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s89
+; GFX11-TRUE16-NEXT: s_mov_b32 s89, s90
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77
@@ -34544,11 +34583,11 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB45_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16
@@ -36440,9 +36479,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v10, s20
; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
; SI-NEXT: v_cvt_f16_f32_e32 v5, s27
@@ -36521,9 +36560,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v10, v2
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
@@ -36551,13 +36590,13 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
-; SI-NEXT: v_or_b32_e32 v2, v11, v2
+; SI-NEXT: v_or_b32_e32 v3, v11, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
@@ -36722,7 +36761,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -36744,7 +36783,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: s_waitcnt vmcnt(11)
@@ -40866,8 +40905,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46
-; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47
-; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59
@@ -40902,8 +40941,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6
; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6
-; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6
+; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6
+; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6
; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6
; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6
; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6
@@ -40945,8 +40984,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr32
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr63
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr62
@@ -41060,8 +41099,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6
; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6
; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6
-; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6
+; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6
+; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6
; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6
; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6
; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
@@ -43915,29 +43954,27 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: s_cbranch_scc0 .LBB53_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshr_b32 s44, s5, 16
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_cvt_f32_f16_e32 v59, s44
-; SI-NEXT: s_lshr_b32 s44, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v36, s44
+; SI-NEXT: s_lshr_b32 s44, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v33, s44
; SI-NEXT: s_lshr_b32 s44, s7, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v13, s44
+; SI-NEXT: v_cvt_f32_f16_e32 v29, s44
; SI-NEXT: s_lshr_b32 s44, s6, 16
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v61, s44
+; SI-NEXT: v_cvt_f32_f16_e32 v25, s44
; SI-NEXT: s_lshr_b32 s44, s9, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v33, s44
+; SI-NEXT: v_cvt_f32_f16_e32 v18, s44
; SI-NEXT: s_lshr_b32 s44, s8, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v48, s44
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s44
; SI-NEXT: s_lshr_b32 s44, s11, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v25, s44
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s44
; SI-NEXT: s_lshr_b32 s44, s10, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s44
; SI-NEXT: s_lshr_b32 s44, s13, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s44
+; SI-NEXT: v_cvt_f32_f16_e32 v48, s44
; SI-NEXT: s_lshr_b32 s44, s12, 16
; SI-NEXT: v_cvt_f32_f16_e32 v17, s44
; SI-NEXT: s_lshr_b32 s44, s15, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v43, s44
+; SI-NEXT: v_cvt_f32_f16_e32 v45, s44
; SI-NEXT: s_lshr_b32 s44, s14, 16
; SI-NEXT: v_cvt_f32_f16_e32 v22, s44
; SI-NEXT: s_lshr_b32 s44, s41, 16
@@ -43973,21 +44010,24 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: s_lshr_b32 s44, s18, 16
; SI-NEXT: v_cvt_f32_f16_e32 v56, s44
; SI-NEXT: s_lshr_b32 s44, s17, 16
+; SI-NEXT: s_waitcnt expcnt(5)
; SI-NEXT: v_cvt_f32_f16_e32 v58, s44
; SI-NEXT: s_lshr_b32 s44, s16, 16
+; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v60, s44
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v18, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v13, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v14, s4
; SI-NEXT: v_cvt_f32_f16_e32 v19, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v29, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v52, s9
-; SI-NEXT: v_cvt_f32_f16_e32 v40, s8
-; SI-NEXT: v_cvt_f32_f16_e32 v45, s11
-; SI-NEXT: v_cvt_f32_f16_e32 v47, s10
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_cvt_f32_f16_e32 v61, s6
; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v62, s13
+; SI-NEXT: v_cvt_f32_f16_e32 v62, s9
+; SI-NEXT: v_cvt_f32_f16_e32 v52, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v40, s11
+; SI-NEXT: v_cvt_f32_f16_e32 v43, s10
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v63, s12
+; SI-NEXT: v_cvt_f32_f16_e32 v63, s13
+; SI-NEXT: v_cvt_f32_f16_e32 v21, s12
; SI-NEXT: v_cvt_f32_f16_e32 v23, s15
; SI-NEXT: v_cvt_f32_f16_e32 v4, s14
; SI-NEXT: v_cvt_f32_f16_e32 v27, s41
@@ -44005,9 +44045,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v42, s21
; SI-NEXT: v_cvt_f32_f16_e32 v8, s20
; SI-NEXT: v_cvt_f32_f16_e32 v20, s19
-; SI-NEXT: v_cvt_f32_f16_e32 v21, s18
+; SI-NEXT: v_cvt_f32_f16_e32 v47, s18
; SI-NEXT: v_cvt_f32_f16_e32 v57, s17
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v59, s16
; SI-NEXT: s_cbranch_execnz .LBB53_3
; SI-NEXT: .LBB53_2: ; %cmp.true
; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0
@@ -44028,104 +44068,107 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42
-; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53
; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0
+; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v54
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v62, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v63, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v20
; SI-NEXT: v_cvt_f32_f16_e32 v20, v58
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v57
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v57, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v15
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0
; SI-NEXT: v_add_f64 v[34:35], s[28:29], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v61, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v53
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v34
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v55
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8
; SI-NEXT: v_cvt_f32_f16_e32 v19, v8
; SI-NEXT: v_cvt_f32_f16_e32 v8, v41
+; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0
+; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v38
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v44
; SI-NEXT: v_add_f64 v[30:31], s[42:43], 1.0
-; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v12
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v62, v12
; SI-NEXT: v_cvt_f32_f16_e32 v12, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v44
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v46
; SI-NEXT: v_add_f64 v[49:50], s[24:25], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v16
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16
+; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v50
+; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v16
; SI-NEXT: v_cvt_f32_f16_e32 v16, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v46
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v18, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v5
; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0
; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v22
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v5
-; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v26
; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v22
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v29, v7
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v11
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v22
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v26
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v53
; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
; SI-NEXT: v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v61, v61
; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v48, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v59
-; SI-NEXT: v_mov_b32_e32 v59, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v18
+; SI-NEXT: v_mov_b32_e32 v18, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v45
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v13
-; SI-NEXT: v_mov_b32_e32 v13, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v10
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v60
; SI-NEXT: v_cvt_f32_f16_e32 v58, v58
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v55, v6
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -44144,9 +44187,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: v_cvt_f32_f16_e32 v60, v6
; SI-NEXT: .LBB53_3: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v6, v60
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v59
; SI-NEXT: v_cvt_f16_f32_e32 v10, v57
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v9, v6
; SI-NEXT: v_cvt_f16_f32_e32 v9, v58
@@ -44158,7 +44201,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen
; SI-NEXT: v_cvt_f16_f32_e32 v6, v56
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v47
; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v9, v6
@@ -44228,125 +44271,125 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v34
-; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v3
+; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v6, v9, v6
+; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v32
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v35
-; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
-; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v35
+; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v9, v6
+; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v12
-; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
-; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v12
+; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v9, v6
+; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v31
-; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
-; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v31
+; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v9, v6
+; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v11
-; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
-; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v11
+; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v9, v6
+; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v27
-; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
-; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v27
+; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v9, v6
+; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v22
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v22
+; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
+; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v4, v4, v6
+; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v43
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v23
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v45
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v23
+; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, v6, v4
+; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v63
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v21
+; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, v6, v4
+; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v62
-; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v63
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, v6, v4
+; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v43
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
-; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0
+; SI-NEXT: v_or_b32_e32 v1, v4, v1
+; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v45
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v40
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
-; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v1, v4, v1
+; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v40
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v62
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v61
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v61
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v29
; SI-NEXT: v_cvt_f16_f32_e32 v2, v19
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v14
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v59
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -44370,11 +44413,11 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB53_4:
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr46
@@ -44409,27 +44452,27 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: s_branch .LBB53_2
;
; VI-LABEL: bitcast_v15f64_to_v60f16_scalar:
@@ -46965,9 +47008,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v10, s20
; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
; SI-NEXT: v_cvt_f16_f32_e32 v5, s27
@@ -47046,9 +47089,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v10, v2
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
; SI-NEXT: v_mov_b32_e32 v33, v32
@@ -47076,13 +47119,13 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: v_or_b32_e32 v1, v12, v1
-; SI-NEXT: v_or_b32_e32 v2, v11, v2
+; SI-NEXT: v_or_b32_e32 v3, v11, v3
; SI-NEXT: v_or_b32_e32 v4, v9, v4
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
@@ -47247,7 +47290,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -47269,7 +47312,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: s_waitcnt vmcnt(11)
@@ -50025,7 +50068,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_cbranch_scc0 .LBB57_2
@@ -50047,118 +50090,123 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s18
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v39
+; SI-NEXT: v_mov_b32_e32 v48, v39
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v4
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s19
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v39
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v5
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s20
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v37
+; SI-NEXT: v_mov_b32_e32 v39, v38
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v6
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s21
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v38
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v7
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT: v_mov_b32_e32 v38, v37
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v8
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s23
-; SI-NEXT: v_mov_b32_e32 v35, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v37
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v9
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s24
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v34
+; SI-NEXT: v_mov_b32_e32 v37, v36
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v10
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s25
-; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v36
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v11
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
+; SI-NEXT: v_mov_b32_e32 v36, v35
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v12
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s27
-; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v13
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v32
+; SI-NEXT: v_mov_b32_e32 v35, v34
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v14
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s29
-; SI-NEXT: v_mov_b32_e32 v32, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v34
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v15
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v50
+; SI-NEXT: v_mov_b32_e32 v34, v33
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v16
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v17
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v62
+; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v18
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v60
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v32
+; SI-NEXT: v_mov_b32_e32 v32, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v50
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v19
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v61
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v60
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v58
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v21
@@ -50187,6 +50235,31 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr49
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v48, v39
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; kill: killed $vgpr49
+; SI-NEXT: v_mov_b32_e32 v39, v38
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; kill: killed $vgpr49
+; SI-NEXT: v_mov_b32_e32 v38, v37
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; kill: killed $vgpr49
+; SI-NEXT: v_mov_b32_e32 v37, v36
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; kill: killed $vgpr49
+; SI-NEXT: v_mov_b32_e32 v36, v35
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; kill: killed $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; kill: killed $vgpr49
; SI-NEXT: v_mov_b32_e32 v35, v34
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
@@ -50254,26 +50327,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: ; kill: killed $vgpr49
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -50309,6 +50362,11 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v33
; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34
; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v35
+; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v36
+; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v37
+; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v38
+; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v39
+; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v48
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: v_cvt_f32_f16_e32 v49, s16
; SI-NEXT: s_add_i32 s17, s17, 3
@@ -50351,50 +50409,45 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s23
; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63
-; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v39
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s24
-; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37
-; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v36
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s25
-; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
-; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v48
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v63
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s27
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v60
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v58
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s29
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v62
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v60
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v58
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -51390,53 +51443,62 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v14, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v12, 16, v11
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.h
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v14, 16, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v13, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v14, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v12, 16, v8
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v14, 16, v6
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v15, 16, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v14, 16, v1
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, s22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v13, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v15, 16, v5
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v12, 16, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v14, 16, v1
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v15, 16, v0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v12, 16, v3
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s22
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v13, 16, v2
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s15, s11
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21
; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v12, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v13, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v15, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s10, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s9, 3 op_sel_hi:[1,0]
@@ -51448,13 +51510,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v6
; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s8, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17
; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s7, 3 op_sel_hi:[1,0]
@@ -51481,18 +51543,16 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s3, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v68.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v37
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v36
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v32
@@ -51508,21 +51568,14 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v67.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v69.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v80.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v81.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v83.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v68.l
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
@@ -51545,9 +51598,9 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s45
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s43
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s42
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s41
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s15
@@ -51596,8 +51649,8 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v27, 16, v17
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v25, 16, v18
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1
@@ -51939,48 +51992,48 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v59, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v7
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v10
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
; SI-NEXT: v_cvt_f16_f32_e32 v49, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v13
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v18
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v8
; SI-NEXT: v_cvt_f16_f32_e32 v48, v11
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
; SI-NEXT: v_cvt_f16_f32_e32 v38, v16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v19
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v11, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v24
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_cvt_f16_f32_e32 v24, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v27
; SI-NEXT: v_cvt_f16_f32_e32 v27, v28
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_cvt_f16_f32_e32 v28, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51
@@ -51990,7 +52043,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v44, v62
; SI-NEXT: v_cvt_f16_f32_e32 v18, v63
; SI-NEXT: v_cvt_f16_f32_e32 v19, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v62, v36
; SI-NEXT: v_cvt_f16_f32_e32 v43, v39
; SI-NEXT: v_cvt_f16_f32_e32 v15, v50
; SI-NEXT: v_cvt_f16_f32_e32 v16, v54
@@ -52004,28 +52057,26 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v56
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v57
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v57
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v58
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v31
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v32
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_cvt_f16_f32_e32 v45, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v58
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v34
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v35
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v35
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v37
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v42, v3
; SI-NEXT: s_waitcnt vmcnt(5)
@@ -52033,21 +52084,19 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v35
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f16_f32_e32 v41, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v41, v37
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v45
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v46
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: v_mov_b32_e32 v47, v21
-; SI-NEXT: v_mov_b32_e32 v56, v17
-; SI-NEXT: v_mov_b32_e32 v57, v6
+; SI-NEXT: v_mov_b32_e32 v56, v6
; SI-NEXT: v_mov_b32_e32 v58, v7
-; SI-NEXT: v_mov_b32_e32 v59, v33
; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_2
; SI-NEXT: ; %bb.1: ; %cmp.true
@@ -52055,34 +52104,34 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v34
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v57
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
+; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v7
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_or_b32_e32 v3, v3, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10
-; SI-NEXT: v_or_b32_e32 v9, v9, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; SI-NEXT: v_or_b32_e32 v31, v31, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62
-; SI-NEXT: v_or_b32_e32 v63, v6, v34
+; SI-NEXT: v_or_b32_e32 v3, v3, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; SI-NEXT: v_or_b32_e32 v31, v31, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v63
+; SI-NEXT: v_or_b32_e32 v57, v6, v35
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
@@ -52098,7 +52147,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
@@ -52109,114 +52158,119 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT: v_or_b32_e32 v12, v12, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_or_b32_e32 v12, v12, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v16
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT: v_or_b32_e32 v15, v15, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v19
+; SI-NEXT: v_or_b32_e32 v15, v15, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_or_b32_e32 v18, v18, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_or_b32_e32 v22, v22, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
+; SI-NEXT: v_or_b32_e32 v26, v26, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v8
+; SI-NEXT: v_or_b32_e32 v11, v11, v35
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v20
+; SI-NEXT: v_or_b32_e32 v1, v1, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v59
+; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v35
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_or_b32_e32 v18, v18, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_or_b32_e32 v22, v22, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27
-; SI-NEXT: v_or_b32_e32 v26, v26, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5
-; SI-NEXT: v_or_b32_e32 v11, v11, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; SI-NEXT: v_or_b32_e32 v2, v2, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v38
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v38, v34
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v38
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_or_b32_e32 v37, v34, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v49
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v48
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v58
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v34
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT: v_or_b32_e32 v48, v34, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_or_b32_e32 v2, v2, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v45, v45
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v35
+; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v49
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v48, v35, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v52
; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v34
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v35
+; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v53
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT: v_or_b32_e32 v52, v34, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v55
+; SI-NEXT: v_or_b32_e32 v52, v35, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v55
+; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v34
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v35
+; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT: v_or_b32_e32 v55, v34, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v60
+; SI-NEXT: v_or_b32_e32 v55, v35, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v35, v60
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v6
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT: v_or_b32_e32 v6, v35, v34
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT: v_or_b32_e32 v6, v37, v35
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_cvt_f32_f16_e32 v36, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
+; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
@@ -52227,7 +52281,10 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
+; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
@@ -52235,158 +52292,145 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41
; SI-NEXT: v_or_b32_e32 v25, v25, v24
; SI-NEXT: v_or_b32_e32 v29, v29, v28
; SI-NEXT: v_or_b32_e32 v54, v54, v51
; SI-NEXT: v_or_b32_e32 v50, v50, v30
+; SI-NEXT: v_or_b32_e32 v36, v36, v32
+; SI-NEXT: v_or_b32_e32 v33, v33, v42
; SI-NEXT: v_or_b32_e32 v39, v39, v41
-; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16
+; SI-NEXT: v_alignbit_b32 v60, v55, v35, 16
+; SI-NEXT: v_alignbit_b32 v59, v52, v45, 16
; SI-NEXT: v_alignbit_b32 v24, v26, v24, 16
; SI-NEXT: v_alignbit_b32 v28, v22, v28, 16
; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16
-; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16
+; SI-NEXT: v_alignbit_b32 v30, v57, v30, 16
+; SI-NEXT: v_alignbit_b32 v32, v31, v32, 16
+; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16
; SI-NEXT: v_alignbit_b32 v41, v3, v41, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v6
-; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_or_b32_e32 v6, v35, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v6
+; SI-NEXT: v_or_b32_e32 v6, v37, v45
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v58
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v57
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16
+; SI-NEXT: v_or_b32_e32 v6, v37, v5
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
-; SI-NEXT: v_or_b32_e32 v58, v35, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v56
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT: v_alignbit_b32 v8, v48, v8, 16
-; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT: v_or_b32_e32 v57, v46, v14
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT: v_alignbit_b32 v5, v48, v5, 16
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_or_b32_e32 v58, v46, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_cvt_f32_f16_e32 v46, v47
-; SI-NEXT: v_alignbit_b32 v14, v37, v14, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_or_b32_e32 v56, v35, v17
-; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v59
+; SI-NEXT: v_alignbit_b32 v14, v2, v14, 16
+; SI-NEXT: v_or_b32_e32 v56, v37, v17
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v62
+; SI-NEXT: v_alignbit_b32 v17, v1, v17, 16
; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT: v_or_b32_e32 v59, v46, v43
+; SI-NEXT: v_or_b32_e32 v62, v46, v43
; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v21, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_or_b32_e32 v47, v35, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v35, v44
+; SI-NEXT: v_or_b32_e32 v47, v37, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v44
; SI-NEXT: v_cvt_f32_f16_e32 v44, v61
-; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
-; SI-NEXT: v_or_b32_e32 v61, v44, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v45
-; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44
-; SI-NEXT: v_or_b32_e32 v36, v36, v45
-; SI-NEXT: v_alignbit_b32 v44, v18, v35, 16
-; SI-NEXT: v_alignbit_b32 v45, v31, v45, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v6
-; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT: v_or_b32_e32 v6, v33, v42
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v2, v17, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v6, v11, v21, 16
-; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; SI-NEXT: v_or_b32_e32 v61, v44, v37
+; SI-NEXT: v_alignbit_b32 v44, v18, v37, 16
; SI-NEXT: .LBB58_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v60
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6
-; SI-NEXT: v_or_b32_e32 v34, v34, v35
-; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v35, 0xffff, v6
+; SI-NEXT: v_or_b32_e32 v35, v35, v37
+; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v34, 0xffff, v55
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40
-; SI-NEXT: v_or_b32_e32 v34, v34, v35
-; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v35, 0xffff, v55
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40
+; SI-NEXT: v_or_b32_e32 v35, v35, v37
+; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v59
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6
-; SI-NEXT: v_or_b32_e32 v1, v34, v1
-; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v53
-; SI-NEXT: v_or_b32_e32 v1, v1, v34
-; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58
-; SI-NEXT: v_or_b32_e32 v1, v1, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49
-; SI-NEXT: v_or_b32_e32 v1, v1, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14
-; SI-NEXT: v_or_b32_e32 v1, v1, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v35, 0xffff, v6
+; SI-NEXT: v_or_b32_e32 v35, v35, v37
+; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v35, 0xffff, v52
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v53
+; SI-NEXT: v_or_b32_e32 v35, v35, v37
+; SI-NEXT: v_add_i32_e32 v37, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v35, 0xffff, v6
+; SI-NEXT: v_or_b32_e32 v5, v35, v5
+; SI-NEXT: v_add_i32_e32 v35, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v5, v35, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38
-; SI-NEXT: v_or_b32_e32 v1, v1, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v48
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49
+; SI-NEXT: v_or_b32_e32 v5, v5, v35
+; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v5, v35, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v58
+; SI-NEXT: v_or_b32_e32 v5, v5, v14
+; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; SI-NEXT: v_or_b32_e32 v1, v1, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
@@ -52401,7 +52445,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -52442,7 +52486,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
@@ -52472,28 +52516,26 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -53188,9 +53230,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v48, v5
; SI-NEXT: v_cvt_f16_f32_e32 v49, v6
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v7
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v8
; SI-NEXT: v_cvt_f16_f32_e32 v37, v9
; SI-NEXT: v_cvt_f16_f32_e32 v38, v10
; SI-NEXT: v_cvt_f16_f32_e32 v5, v11
@@ -53275,8 +53317,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v33, v11
; SI-NEXT: v_mov_b32_e32 v11, v8
; SI-NEXT: v_mov_b32_e32 v8, v5
-; SI-NEXT: v_mov_b32_e32 v5, v42
-; SI-NEXT: v_mov_b32_e32 v42, v1
+; SI-NEXT: v_mov_b32_e32 v5, v1
; SI-NEXT: s_cbranch_vccnz .LBB59_5
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -54285,53 +54326,62 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v14, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v12, 16, v11
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.h
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v14, 16, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v13, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v14, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v12, 16, v8
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v14, 16, v6
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v15, 16, v5
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v14, 16, v1
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, s22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v13, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v15, 16, v5
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v12, 16, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v14, 16, v1
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v15, 16, v0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v12, 16, v3
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s22
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v13, 16, v2
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s15, s11
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v12, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v13, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v15, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s10 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s9 op_sel_hi:[0,1]
@@ -54343,13 +54393,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v6
; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s8 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17
; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s7 op_sel_hi:[0,1]
@@ -54376,18 +54426,16 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s3 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v68.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v37
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v36
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v32
@@ -54403,21 +54451,14 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v67.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v69.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v80.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v81.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v83.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v68.l
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
@@ -54440,9 +54481,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s45
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s43
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s42
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s41
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s15
@@ -54491,8 +54532,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v18
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v27, 16, v17
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v25, 16, v18
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index f888f4f3b1407..8a2602e668a15 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -7895,11 +7895,11 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v13.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l
; GFX11-TRUE16-NEXT: .LBB38_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4
@@ -7969,6 +7969,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10
; GFX11-TRUE16-NEXT: .LBB38_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v14.l
@@ -11041,62 +11042,57 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) {
; VI-LABEL: bitcast_v6i16_to_v12i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_mov_b32_e32 v14, v1
+; VI-NEXT: v_mov_b32_e32 v13, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0
-; VI-NEXT: ; implicit-def: $vgpr16
-; VI-NEXT: ; implicit-def: $vgpr15
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
+; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: ; implicit-def: $vgpr3
-; VI-NEXT: ; implicit-def: $vgpr14
; VI-NEXT: ; implicit-def: $vgpr5
; VI-NEXT: ; implicit-def: $vgpr7
-; VI-NEXT: ; implicit-def: $vgpr8
; VI-NEXT: ; implicit-def: $vgpr9
; VI-NEXT: ; implicit-def: $vgpr11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT: s_cbranch_execz .LBB46_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v0
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v16, v0
-; VI-NEXT: v_mov_b32_e32 v14, v1
-; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: ; implicit-def: $vgpr1
-; VI-NEXT: ; implicit-def: $vgpr2
-; VI-NEXT: .LBB46_2: ; %Flow
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13
+; VI-NEXT: ; %bb.2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB46_4
; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_mov_b32_e32 v3, 3
-; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v14, 3, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; VI-NEXT: v_add_u16_e32 v16, 3, v0
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13
-; VI-NEXT: v_or_b32_e32 v1, v14, v1
-; VI-NEXT: v_or_b32_e32 v0, v16, v0
-; VI-NEXT: v_add_u16_sdwa v10, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v8, 3, v2
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10
+; VI-NEXT: v_add_u16_sdwa v6, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v15, 3, v14
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; VI-NEXT: v_add_u16_sdwa v2, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v10, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v15, v0
+; VI-NEXT: v_add_u16_e32 v14, 3, v13
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; VI-NEXT: v_add_u16_e32 v16, 3, v8
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10
+; VI-NEXT: v_or_b32_e32 v0, v14, v0
+; VI-NEXT: v_or_b32_e32 v7, v16, v3
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT: v_or_b32_e32 v2, v8, v2
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3]
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_bfe_u32 v7, v6, 8, 8
+; VI-NEXT: v_mov_b32_e32 v13, v14
+; VI-NEXT: v_mov_b32_e32 v14, v15
+; VI-NEXT: v_mov_b32_e32 v8, v16
; VI-NEXT: .LBB46_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, v16
-; VI-NEXT: v_mov_b32_e32 v1, v15
-; VI-NEXT: v_mov_b32_e32 v2, v13
+; VI-NEXT: v_mov_b32_e32 v0, v13
; VI-NEXT: v_mov_b32_e32 v4, v14
; VI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 70211c302829c..c4c977f86c299 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -7632,8 +7632,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7LESS-NEXT: s_load_dword s10, s[4:5], 0xd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB13_4
@@ -7653,27 +7653,28 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX7LESS-NEXT: s_not_b32 s13, s12
; GFX7LESS-NEXT: s_lshl_b32 s14, s6, s11
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s15
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v1
-; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_and_b32_e32 v2, s13, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v4
+; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0
+; GFX7LESS-NEXT: v_and_b32_e32 v1, s13, v4
+; GFX7LESS-NEXT: v_or_b32_e32 v3, v1, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX7LESS-NEXT: .LBB13_4: ; %Flow
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -7683,7 +7684,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xff
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0
+; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v2, v0
; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -7693,8 +7694,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX8-NEXT: s_load_dword s10, s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8-NEXT: s_cbranch_execz .LBB13_4
@@ -7713,27 +7714,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX8-NEXT: s_lshl_b32 s14, s2, s11
; GFX8-NEXT: s_mov_b64 s[2:3], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v1
-; GFX8-NEXT: v_and_b32_e32 v2, s13, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v4
+; GFX8-NEXT: v_and_b32_e32 v1, s13, v4
; GFX8-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX8-NEXT: v_or_b32_e32 v3, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_cbranch_execnz .LBB13_2
; GFX8-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX8-NEXT: .LBB13_4: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -7742,7 +7743,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u16 v0, s10, v4, v0
+; GFX8-NEXT: v_mad_u16 v0, s10, v2, v0
; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -7752,8 +7753,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX9-NEXT: s_load_dword s10, s[4:5], 0x34
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9-NEXT: s_cbranch_execz .LBB13_4
@@ -7772,26 +7773,26 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX9-NEXT: s_lshl_b32 s14, s2, s11
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_add_u32_e32 v0, s14, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_add_u32_e32 v0, s14, v4
; GFX9-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX9-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX9-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX9-NEXT: .LBB13_4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
@@ -7800,7 +7801,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v4, v0
+; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v2, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -7811,9 +7812,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1064-NEXT: s_load_dword s10, s[4:5], 0x34
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_4
; GFX1064-NEXT: ; %bb.1:
@@ -7832,32 +7833,32 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s3
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1064-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v4
; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX1064-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1064-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v2
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1064-NEXT: .LBB13_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2
+; GFX1064-NEXT: v_mad_u16 v0, s10, v2, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
@@ -7869,9 +7870,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1032-NEXT: s_load_dword s8, s[4:5], 0x34
; GFX1032-NEXT: s_mov_b32 s6, exec_lo
; GFX1032-NEXT: s_mov_b32 s10, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_4
; GFX1032-NEXT: ; %bb.1:
@@ -7889,32 +7890,32 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1032-NEXT: s_lshl_b32 s12, s6, s2
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: v_mov_b32_e32 v0, s7
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1
+; GFX1032-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0
-; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX1032-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1032-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v2
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1032-NEXT: s_or_b32 s10, vcc_lo, s10
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s10
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1032-NEXT: .LBB13_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2
+; GFX1032-NEXT: v_mad_u16 v0, s8, v2, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
@@ -7928,9 +7929,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1164-TRUE16-NEXT: s_mov_b64 s[8:9], exec
; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
-; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB13_4
; GFX1164-TRUE16-NEXT: ; %bb.1:
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -7949,29 +7950,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v4
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1164-TRUE16-NEXT: .LBB13_4: ; %Flow
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -7980,7 +7981,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
+; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v2.l, s2
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1164-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX1164-TRUE16-NEXT: s_endpgm
@@ -7994,9 +7995,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1164-FAKE16-NEXT: s_mov_b64 s[8:9], exec
; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB13_4
; GFX1164-FAKE16-NEXT: ; %bb.1:
; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -8014,29 +8015,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v4
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1164-FAKE16-NEXT: .LBB13_4: ; %Flow
; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8045,7 +8046,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2
+; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v2, s2
; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1164-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX1164-FAKE16-NEXT: s_endpgm
@@ -8057,11 +8058,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1132-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34
; GFX1132-TRUE16-NEXT: s_mov_b32 s6, exec_lo
; GFX1132-TRUE16-NEXT: s_mov_b32 s10, 0
-; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1132-TRUE16-NEXT: s_mov_b32 s9, exec_lo
; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB13_4
; GFX1132-TRUE16-NEXT: ; %bb.1:
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -8079,27 +8080,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1132-TRUE16-NEXT: s_lshl_b32 s12, s6, s2
; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132-TRUE16-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1132-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-TRUE16-NEXT: .LBB13_4: ; %Flow
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8108,7 +8109,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
+; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v2.l, s2
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1132-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX1132-TRUE16-NEXT: s_endpgm
@@ -8120,11 +8121,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1132-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34
; GFX1132-FAKE16-NEXT: s_mov_b32 s6, exec_lo
; GFX1132-FAKE16-NEXT: s_mov_b32 s10, 0
-; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1132-FAKE16-NEXT: s_mov_b32 s9, exec_lo
; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB13_4
; GFX1132-FAKE16-NEXT: ; %bb.1:
; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -8141,27 +8142,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1132-FAKE16-NEXT: s_lshl_b32 s12, s6, s2
; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132-FAKE16-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1132-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-FAKE16-NEXT: .LBB13_4: ; %Flow
; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8170,7 +8171,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2
+; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v2, s2
; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1132-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX1132-FAKE16-NEXT: s_endpgm
@@ -8184,9 +8185,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1264-TRUE16-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
-; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB13_4
; GFX1264-TRUE16-NEXT: ; %bb.1:
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
@@ -8207,28 +8208,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v4
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB13_2
; GFX1264-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1264-TRUE16-NEXT: .LBB13_4: ; %Flow
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8238,7 +8240,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
+; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v2.l, s2
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX1264-TRUE16-NEXT: s_endpgm
@@ -8252,9 +8254,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1264-FAKE16-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB13_4
; GFX1264-FAKE16-NEXT: ; %bb.1:
; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
@@ -8273,28 +8275,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v4
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB13_2
; GFX1264-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1264-FAKE16-NEXT: .LBB13_4: ; %Flow
; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8304,7 +8307,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2
+; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v2, s2
; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX1264-FAKE16-NEXT: s_endpgm
@@ -8316,11 +8319,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1232-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34
; GFX1232-TRUE16-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-TRUE16-NEXT: s_mov_b32 s10, 0
-; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1232-TRUE16-NEXT: s_mov_b32 s9, exec_lo
; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB13_4
; GFX1232-TRUE16-NEXT: ; %bb.1:
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
@@ -8341,27 +8344,28 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1232-TRUE16-NEXT: s_lshl_b32 s12, s6, s2
; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1232-TRUE16-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1232-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10
; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB13_2
; GFX1232-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-TRUE16-NEXT: .LBB13_4: ; %Flow
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8371,7 +8375,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
+; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v2.l, s2
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX1232-TRUE16-NEXT: s_endpgm
@@ -8383,11 +8387,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1232-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34
; GFX1232-FAKE16-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-FAKE16-NEXT: s_mov_b32 s10, 0
-; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1232-FAKE16-NEXT: s_mov_b32 s9, exec_lo
; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB13_4
; GFX1232-FAKE16-NEXT: ; %bb.1:
; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
@@ -8407,27 +8411,28 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1232-FAKE16-NEXT: s_lshl_b32 s12, s6, s2
; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1232-FAKE16-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1232-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10
; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB13_2
; GFX1232-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-FAKE16-NEXT: .LBB13_4: ; %Flow
; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8437,7 +8442,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2
+; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v2, s2
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX1232-FAKE16-NEXT: s_endpgm
@@ -8464,27 +8469,28 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX7LESS-NEXT: s_lshl_b32 s2, s3, s10
; GFX7LESS-NEXT: s_not_b32 s3, s11
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v2
+; GFX7LESS-NEXT: v_or_b32_e32 v1, s2, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v0
; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -8504,27 +8510,27 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX8-NEXT: s_lshl_b32 s10, s2, s8
; GFX8-NEXT: s_mov_b64 s[2:3], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_and_b32_e32 v0, s9, v1
-; GFX8-NEXT: v_or_b32_e32 v0, s10, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX8-NEXT: v_and_b32_e32 v0, s9, v2
+; GFX8-NEXT: v_or_b32_e32 v1, s10, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -8544,27 +8550,27 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX9-NEXT: s_lshl_b32 s10, s2, s8
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_and_b32_e32 v0, s9, v1
-; GFX9-NEXT: v_or_b32_e32 v0, s10, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX9-NEXT: v_and_b32_e32 v0, s9, v2
+; GFX9-NEXT: v_or_b32_e32 v1, s10, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -8586,23 +8592,23 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX1064-NEXT: s_not_b32 s9, s2
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s3
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, s10
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT: v_and_or_b32 v1, v2, s9, s10
+; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v2
+; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -8626,23 +8632,23 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX1032-NEXT: s_not_b32 s3, s3
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: v_mov_b32_e32 v0, s7
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, s8
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT: v_and_or_b32 v1, v2, s3, s8
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v2
+; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_cbranch_execnz .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -8666,25 +8672,26 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX1164-NEXT: s_not_b32 s9, s2
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_and_or_b32 v0, v1, s9, s10
-; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-NEXT: v_and_or_b32 v1, v2, s9, s10
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b8 v0, off, s[0:3], 0
@@ -8708,24 +8715,26 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX1132-NEXT: s_not_b32 s3, s3
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s7
+; GFX1132-NEXT: v_mov_b32_e32 v0, s7
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_and_or_b32 v0, v1, s3, s8
-; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-NEXT: v_and_or_b32 v1, v2, s3, s8
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
; GFX1132-NEXT: s_cbranch_execnz .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b8 v0, off, s[0:3], 0
@@ -8749,25 +8758,26 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX1264-NEXT: s_not_b32 s9, s2
; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mov_b32_e32 v1, s3
+; GFX1264-NEXT: v_mov_b32_e32 v0, s3
; GFX1264-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_and_or_b32 v0, v1, s9, s10
-; GFX1264-NEXT: v_mov_b32_e32 v3, v1
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT: s_wait_loadcnt 0x0
-; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-NEXT: v_and_or_b32 v1, v2, s9, s10
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-NEXT: s_cbranch_execnz .LBB14_1
; GFX1264-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b8 v0, off, s[0:3], null
@@ -8791,24 +8801,26 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX1232-NEXT: s_not_b32 s3, s3
; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_mov_b32_e32 v1, s7
+; GFX1232-NEXT: v_mov_b32_e32 v0, s7
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_and_or_b32 v0, v1, s3, s8
-; GFX1232-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT: s_wait_loadcnt 0x0
-; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-NEXT: v_and_or_b32 v1, v2, s3, s8
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
; GFX1232-NEXT: s_cbranch_execnz .LBB14_1
; GFX1232-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b8 v0, off, s[0:3], null
@@ -9334,8 +9346,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7LESS-NEXT: s_load_dword s10, s[4:5], 0xd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB16_4
@@ -9355,27 +9367,28 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX7LESS-NEXT: s_not_b32 s13, s12
; GFX7LESS-NEXT: s_lshl_b32 s14, s6, s11
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s15
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v1
-; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_and_b32_e32 v2, s13, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s14, v4
+; GFX7LESS-NEXT: v_and_b32_e32 v0, s12, v0
+; GFX7LESS-NEXT: v_and_b32_e32 v1, s13, v4
+; GFX7LESS-NEXT: v_or_b32_e32 v3, v1, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2
; GFX7LESS-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT: v_bfe_u32 v0, v2, s11, 16
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_bfe_u32 v0, v0, s11, 16
; GFX7LESS-NEXT: .LBB16_4: ; %Flow
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -9385,7 +9398,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xffff
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0
+; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v2, v0
; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -9395,8 +9408,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX8-NEXT: s_load_dword s10, s[4:5], 0x34
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8-NEXT: s_cbranch_execz .LBB16_4
@@ -9415,27 +9428,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX8-NEXT: s_lshl_b32 s14, s2, s11
; GFX8-NEXT: s_mov_b64 s[2:3], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v1
-; GFX8-NEXT: v_and_b32_e32 v2, s13, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v4
+; GFX8-NEXT: v_and_b32_e32 v1, s13, v4
; GFX8-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX8-NEXT: v_or_b32_e32 v3, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_cbranch_execnz .LBB16_2
; GFX8-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX8-NEXT: .LBB16_4: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -9444,7 +9457,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u16 v0, s10, v4, v0
+; GFX8-NEXT: v_mad_u16 v0, s10, v2, v0
; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -9454,8 +9467,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX9-NEXT: s_load_dword s10, s[4:5], 0x34
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9-NEXT: s_cbranch_execz .LBB16_4
@@ -9474,26 +9487,26 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX9-NEXT: s_lshl_b32 s14, s2, s11
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_add_u32_e32 v0, s14, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_add_u32_e32 v0, s14, v4
; GFX9-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX9-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX9-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX9-NEXT: .LBB16_4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -9502,7 +9515,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v4, v0
+; GFX9-NEXT: v_mad_legacy_u16 v0, s10, v2, v0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -9513,9 +9526,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1064-NEXT: s_load_dword s10, s[4:5], 0x34
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_4
; GFX1064-NEXT: ; %bb.1:
@@ -9534,32 +9547,32 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s3
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1064-NEXT: v_mov_b32_e32 v4, v0
+; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v4
; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX1064-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1064-NEXT: v_mov_b32_e32 v0, v3
+; GFX1064-NEXT: v_mov_b32_e32 v1, v4
+; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v2
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1064-NEXT: .LBB16_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_mad_u16 v0, s10, v4, s2
+; GFX1064-NEXT: v_mad_u16 v0, s10, v2, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
@@ -9571,9 +9584,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1032-NEXT: s_load_dword s8, s[4:5], 0x34
; GFX1032-NEXT: s_mov_b32 s6, exec_lo
; GFX1032-NEXT: s_mov_b32 s10, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_4
; GFX1032-NEXT: ; %bb.1:
@@ -9591,32 +9604,32 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1032-NEXT: s_lshl_b32 s12, s6, s2
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: v_mov_b32_e32 v0, s7
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1
+; GFX1032-NEXT: v_mov_b32_e32 v4, v0
+; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0
-; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX1032-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1032-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-NEXT: v_mov_b32_e32 v1, v4
+; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v2
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1032-NEXT: s_or_b32 s10, vcc_lo, s10
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s10
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1032-NEXT: .LBB16_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_mad_u16 v0, s8, v4, s2
+; GFX1032-NEXT: v_mad_u16 v0, s8, v2, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
@@ -9630,9 +9643,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1164-TRUE16-NEXT: s_mov_b64 s[8:9], exec
; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
-; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB16_4
; GFX1164-TRUE16-NEXT: ; %bb.1:
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -9651,29 +9664,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v4
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1164-TRUE16-NEXT: .LBB16_4: ; %Flow
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9682,7 +9695,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
+; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v2.l, s2
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX1164-TRUE16-NEXT: s_endpgm
@@ -9696,9 +9709,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1164-FAKE16-NEXT: s_mov_b64 s[8:9], exec
; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB16_4
; GFX1164-FAKE16-NEXT: ; %bb.1:
; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -9716,29 +9729,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v4
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1164-FAKE16-NEXT: .LBB16_4: ; %Flow
; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9747,7 +9760,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2
+; GFX1164-FAKE16-NEXT: v_mad_u16 v0, s10, v2, s2
; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX1164-FAKE16-NEXT: s_endpgm
@@ -9759,11 +9772,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1132-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34
; GFX1132-TRUE16-NEXT: s_mov_b32 s6, exec_lo
; GFX1132-TRUE16-NEXT: s_mov_b32 s10, 0
-; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1132-TRUE16-NEXT: s_mov_b32 s9, exec_lo
; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB16_4
; GFX1132-TRUE16-NEXT: ; %bb.1:
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -9781,27 +9794,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1132-TRUE16-NEXT: s_lshl_b32 s12, s6, s2
; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132-TRUE16-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1132-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-TRUE16-NEXT: .LBB16_4: ; %Flow
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9810,7 +9823,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
+; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v2.l, s2
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX1132-TRUE16-NEXT: s_endpgm
@@ -9822,11 +9835,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1132-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34
; GFX1132-FAKE16-NEXT: s_mov_b32 s6, exec_lo
; GFX1132-FAKE16-NEXT: s_mov_b32 s10, 0
-; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1132-FAKE16-NEXT: s_mov_b32 s9, exec_lo
; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB16_4
; GFX1132-FAKE16-NEXT: ; %bb.1:
; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -9843,27 +9856,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1132-FAKE16-NEXT: s_lshl_b32 s12, s6, s2
; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1132-FAKE16-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1132-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-FAKE16-NEXT: .LBB16_4: ; %Flow
; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9872,7 +9885,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2
+; GFX1132-FAKE16-NEXT: v_mad_u16 v0, s8, v2, s2
; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX1132-FAKE16-NEXT: s_endpgm
@@ -9886,9 +9899,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1264-TRUE16-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
-; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1264-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB16_4
; GFX1264-TRUE16-NEXT: ; %bb.1:
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
@@ -9909,28 +9922,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v4
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB16_2
; GFX1264-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1264-TRUE16-NEXT: .LBB16_4: ; %Flow
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9940,7 +9954,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
+; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v2.l, s2
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1264-TRUE16-NEXT: s_endpgm
@@ -9954,9 +9968,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1264-FAKE16-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1264-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB16_4
; GFX1264-FAKE16-NEXT: ; %bb.1:
; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
@@ -9975,28 +9989,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v4
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT: v_and_or_b32 v3, v4, s13, v0
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB16_2
; GFX1264-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
+; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s11, v0
; GFX1264-FAKE16-NEXT: .LBB16_4: ; %Flow
; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -10006,7 +10021,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2
+; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v2, s2
; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1264-FAKE16-NEXT: s_endpgm
@@ -10018,11 +10033,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1232-TRUE16-NEXT: s_load_b32 s8, s[4:5], 0x34
; GFX1232-TRUE16-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-TRUE16-NEXT: s_mov_b32 s10, 0
-; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1232-TRUE16-NEXT: s_mov_b32 s9, exec_lo
; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1232-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB16_4
; GFX1232-TRUE16-NEXT: ; %bb.1:
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
@@ -10043,27 +10058,28 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1232-TRUE16-NEXT: s_lshl_b32 s12, s6, s2
; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1232-TRUE16-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1232-TRUE16-NEXT: s_or_b32 s10, vcc_lo, s10
; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB16_2
; GFX1232-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-TRUE16-NEXT: .LBB16_4: ; %Flow
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -10073,7 +10089,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
+; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v2.l, s2
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1232-TRUE16-NEXT: s_endpgm
@@ -10085,11 +10101,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1232-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x34
; GFX1232-FAKE16-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-FAKE16-NEXT: s_mov_b32 s10, 0
-; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1232-FAKE16-NEXT: s_mov_b32 s9, exec_lo
; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1232-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB16_4
; GFX1232-FAKE16-NEXT: ; %bb.1:
; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
@@ -10109,27 +10125,28 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1232-FAKE16-NEXT: s_lshl_b32 s12, s6, s2
; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1
-; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v4, v0
+; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v4
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX1232-FAKE16-NEXT: v_and_or_b32 v3, v4, s11, v0
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
; GFX1232-FAKE16-NEXT: s_or_b32 s10, vcc_lo, s10
; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s10
; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB16_2
; GFX1232-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end
; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-FAKE16-NEXT: .LBB16_4: ; %Flow
; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -10139,7 +10156,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2
+; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v2, s2
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1232-FAKE16-NEXT: s_endpgm
@@ -10166,27 +10183,28 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX7LESS-NEXT: s_lshl_b32 s2, s3, s10
; GFX7LESS-NEXT: s_not_b32 s3, s11
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_and_b32_e32 v0, s3, v2
+; GFX7LESS-NEXT: v_or_b32_e32 v1, s2, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v0
; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -10206,27 +10224,27 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX8-NEXT: s_lshl_b32 s10, s2, s8
; GFX8-NEXT: s_mov_b64 s[2:3], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_and_b32_e32 v0, s9, v1
-; GFX8-NEXT: v_or_b32_e32 v0, s10, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX8-NEXT: v_and_b32_e32 v0, s9, v2
+; GFX8-NEXT: v_or_b32_e32 v1, s10, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -10246,27 +10264,27 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX9-NEXT: s_lshl_b32 s10, s2, s8
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_and_b32_e32 v0, s9, v1
-; GFX9-NEXT: v_or_b32_e32 v0, s10, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX9-NEXT: v_and_b32_e32 v0, s9, v2
+; GFX9-NEXT: v_or_b32_e32 v1, s10, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -10288,23 +10306,23 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX1064-NEXT: s_not_b32 s9, s2
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s3
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, s10
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT: v_and_or_b32 v1, v2, s9, s10
+; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v2
+; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -10328,23 +10346,23 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX1032-NEXT: s_not_b32 s3, s3
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: v_mov_b32_e32 v0, s7
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, s8
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT: v_and_or_b32 v1, v2, s3, s8
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v2
+; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -10368,25 +10386,26 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX1164-NEXT: s_not_b32 s9, s2
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_and_or_b32 v0, v1, s9, s10
-; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-NEXT: v_and_or_b32 v1, v2, s9, s10
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -10410,24 +10429,26 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX1132-NEXT: s_not_b32 s3, s3
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s7
+; GFX1132-NEXT: v_mov_b32_e32 v0, s7
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_and_or_b32 v0, v1, s3, s8
-; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-NEXT: v_and_or_b32 v1, v2, s3, s8
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -10451,25 +10472,26 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX1264-NEXT: s_not_b32 s9, s2
; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mov_b32_e32 v1, s3
+; GFX1264-NEXT: v_mov_b32_e32 v0, s3
; GFX1264-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_and_or_b32 v0, v1, s9, s10
-; GFX1264-NEXT: v_mov_b32_e32 v3, v1
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT: s_wait_loadcnt 0x0
-; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-NEXT: v_and_or_b32 v1, v2, s9, s10
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-NEXT: s_cbranch_execnz .LBB17_1
; GFX1264-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -10493,24 +10515,26 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX1232-NEXT: s_not_b32 s3, s3
; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_mov_b32_e32 v1, s7
+; GFX1232-NEXT: v_mov_b32_e32 v0, s7
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_and_or_b32 v0, v1, s3, s8
-; GFX1232-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT: s_wait_loadcnt 0x0
-; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-NEXT: v_and_or_b32 v1, v2, s3, s8
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
; GFX1232-NEXT: s_cbranch_execnz .LBB17_1
; GFX1232-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -10536,35 +10560,36 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7LESS-NEXT: s_lshl_b32 s2, 0xffff, s10
-; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX7LESS-NEXT: s_not_b32 s2, s2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s3
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v4
; GFX7LESS-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v2
; GFX7LESS-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_and_b32_e32 v2, s2, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX7LESS-NEXT: v_and_b32_e32 v1, s2, v4
+; GFX7LESS-NEXT: v_or_b32_e32 v3, v1, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v0
; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -10584,28 +10609,28 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX8-NEXT: s_lshl_b32 s2, 0xffff, s10
; GFX8-NEXT: s_not_b32 s2, s2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX8-NEXT: v_add_f16_e32 v0, s11, v0
-; GFX8-NEXT: v_and_b32_e32 v2, s2, v1
+; GFX8-NEXT: v_and_b32_e32 v1, s2, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v0
; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -10625,27 +10650,27 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s10
; GFX9-NEXT: s_not_b32 s2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX9-NEXT: v_add_f16_e32 v0, s11, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX9-NEXT: v_and_or_b32 v0, v1, s2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX9-NEXT: v_and_or_b32 v1, v2, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_cbranch_execnz .LBB18_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -10665,26 +10690,26 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1064-NEXT: s_lshl_b32 s2, 0xffff, s9
; GFX1064-NEXT: s_not_b32 s10, s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s3
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v2
; GFX1064-NEXT: v_add_f16_e32 v0, s8, v0
; GFX1064-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX1064-NEXT: v_and_or_b32 v0, v1, s10, v0
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT: v_and_or_b32 v1, v2, s10, v0
+; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v2
+; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB18_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v2
+; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -10706,26 +10731,26 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1032-NEXT: s_lshl_b32 s3, 0xffff, s2
; GFX1032-NEXT: s_not_b32 s3, s3
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s6
+; GFX1032-NEXT: v_mov_b32_e32 v0, s6
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1032-NEXT: v_add_f16_e32 v0, s8, v0
; GFX1032-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, v0
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT: v_and_or_b32 v1, v2, s3, v0
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v2
+; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_cbranch_execnz .LBB18_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -10748,31 +10773,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1164-TRUE16-NEXT: s_lshl_b32 s2, 0xffff, s9
; GFX1164-TRUE16-NEXT: s_not_b32 s10, s2
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1
-; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT: v_and_or_b32 v1, v2, s10, v0
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX1164-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2
+; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v0
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -10795,31 +10821,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1164-FAKE16-NEXT: s_lshl_b32 s2, 0xffff, s9
; GFX1164-FAKE16-NEXT: s_not_b32 s10, s2
; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1
-; GFX1164-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0
; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s10, v0
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT: v_and_or_b32 v1, v2, s10, v0
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX1164-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2
+; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v0
; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -10842,30 +10869,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1132-TRUE16-NEXT: s_lshl_b32 s3, 0xffff, s2
; GFX1132-TRUE16-NEXT: s_not_b32 s3, s3
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s6
; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
-; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0
-; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
+; GFX1132-TRUE16-NEXT: v_and_or_b32 v1, v2, s3, v0
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-TRUE16-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX1132-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -10888,30 +10917,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1132-FAKE16-NEXT: s_lshl_b32 s3, 0xffff, s2
; GFX1132-FAKE16-NEXT: s_not_b32 s3, s3
; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s6
; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1132-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
-; GFX1132-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0
; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0
-; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT: v_and_or_b32 v1, v2, s3, v0
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-FAKE16-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX1132-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -10934,31 +10965,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1264-TRUE16-NEXT: s_lshl_b32 s2, 0xffff, s9
; GFX1264-TRUE16-NEXT: s_not_b32 s10, s2
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1
-; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT: v_and_or_b32 v1, v2, s10, v0
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX1264-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2
+; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v0
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -10981,31 +11013,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1264-FAKE16-NEXT: s_lshl_b32 s2, 0xffff, s9
; GFX1264-FAKE16-NEXT: s_not_b32 s10, s2
; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s3
; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1
-; GFX1264-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0
; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s10, v0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT: v_and_or_b32 v1, v2, s10, v0
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX1264-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v2
+; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v0
; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -11028,30 +11061,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1232-TRUE16-NEXT: s_lshl_b32 s3, 0xffff, s2
; GFX1232-TRUE16-NEXT: s_not_b32 s3, s3
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s6
; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
-; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0
-; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
+; GFX1232-TRUE16-NEXT: v_and_or_b32 v1, v2, s3, v0
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-TRUE16-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX1232-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -11074,30 +11109,32 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1232-FAKE16-NEXT: s_lshl_b32 s3, 0xffff, s2
; GFX1232-FAKE16-NEXT: s_not_b32 s3, s3
; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s6
; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1
; GFX1232-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
-; GFX1232-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0
; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0
-; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT: v_and_or_b32 v1, v2, s3, v0
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-FAKE16-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX1232-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -11123,35 +11160,36 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX7LESS-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX7LESS-NEXT: v_mul_f32_e64 v0, 1.0, s6
; GFX7LESS-NEXT: s_lshl_b32 s2, 0xffff, s10
-; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; GFX7LESS-NEXT: s_not_b32 s2, s2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s3
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v4
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX7LESS-NEXT: v_add_f32_e32 v0, v0, v2
; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_and_b32_e32 v2, s2, v1
-; GFX7LESS-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX7LESS-NEXT: v_and_b32_e32 v1, s2, v4
+; GFX7LESS-NEXT: v_or_b32_e32 v3, v1, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s10, v0
; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -11170,37 +11208,37 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX8-NEXT: s_lshl_b32 s2, s6, 16
; GFX8-NEXT: s_not_b32 s3, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, s2, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s3, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s10
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX8-NEXT: v_lshrrev_b32_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v0, s2, v0
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, s3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v0
; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -11220,34 +11258,34 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX9-NEXT: s_lshl_b32 s2, s6, 16
; GFX9-NEXT: s_not_b32 s3, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_lshrrev_b32_sdwa v0, s10, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_add_f32_e32 v0, s2, v0
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s11
+; GFX9-NEXT: v_add3_u32 v1, v1, v0, s11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_and_or_b32 v0, v1, s3, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX9-NEXT: v_and_or_b32 v1, v2, s3, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_cbranch_execnz .LBB19_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, s10, v0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -11268,31 +11306,31 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1064-NEXT: s_not_b32 s9, s2
; GFX1064-NEXT: s_mov_b64 s[2:3], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s7
+; GFX1064-NEXT: v_mov_b32_e32 v0, s7
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: v_lshrrev_b32_sdwa v0, s8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX1064-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX1064-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1064-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1064-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1064-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1064-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1064-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX1064-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1064-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX1064-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, v0
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT: v_and_or_b32 v1, v2, s9, v0
+; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v2
+; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB19_1
; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -11315,31 +11353,31 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1032-NEXT: s_not_b32 s8, s3
; GFX1032-NEXT: s_mov_b32 s3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: v_mov_b32_e32 v0, s7
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_lshrrev_b32_sdwa v0, s2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: v_lshrrev_b32_sdwa v0, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX1032-NEXT: v_add_f32_e32 v0, s9, v0
-; GFX1032-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1032-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1032-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1032-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1032-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1032-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX1032-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1032-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
; GFX1032-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX1032-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT: v_and_or_b32 v1, v2, s8, v0
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v2
+; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1032-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: s_cbranch_execnz .LBB19_1
; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -11362,41 +11400,42 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1164-TRUE16-NEXT: s_not_b32 s9, s2
; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-TRUE16-NEXT: .p2align 6
; GFX1164-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1
-; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1164-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX1164-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1164-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v2
+; GFX1164-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s9, v0
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v1
+; GFX1164-TRUE16-NEXT: v_and_or_b32 v1, v2, s9, v0
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
; GFX1164-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -11419,40 +11458,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1164-FAKE16-NEXT: s_not_b32 s9, s2
; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-FAKE16-NEXT: .p2align 6
; GFX1164-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1
-; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1164-FAKE16-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX1164-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1164-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX1164-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s8, v0
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s9, v0
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT: v_and_or_b32 v1, v2, s9, v0
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
; GFX1164-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -11475,40 +11515,42 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1132-TRUE16-NEXT: s_not_b32 s8, s3
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-TRUE16-NEXT: .p2align 6
; GFX1132-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
-; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1132-TRUE16-NEXT: v_add_f32_e32 v0, s9, v0
-; GFX1132-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1132-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1132-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1132-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
+; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v1
+; GFX1132-TRUE16-NEXT: v_and_or_b32 v1, v2, s8, v0
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-TRUE16-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
; GFX1132-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -11531,39 +11573,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1132-FAKE16-NEXT: s_not_b32 s8, s3
; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0
; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-FAKE16-NEXT: .p2align 6
; GFX1132-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
-; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1132-FAKE16-NEXT: v_add_f32_e32 v0, s9, v0
-; GFX1132-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1132-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1132-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX1132-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT: v_and_or_b32 v1, v2, s8, v0
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-FAKE16-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
; GFX1132-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -11586,41 +11630,42 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1264-TRUE16-NEXT: s_not_b32 s9, s2
; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1
-; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1264-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX1264-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1264-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1264-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1264-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v2
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s9, v0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v1
+; GFX1264-TRUE16-NEXT: v_and_or_b32 v1, v2, s9, v0
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
; GFX1264-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -11643,40 +11688,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1264-FAKE16-NEXT: s_not_b32 s9, s2
; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v1
-; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1264-FAKE16-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX1264-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1264-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1264-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1264-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s8, v0
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s9, v0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT: v_and_or_b32 v1, v2, s9, v0
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
; GFX1264-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s8, v0
; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -11699,40 +11745,42 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1232-TRUE16-NEXT: s_not_b32 s8, s3
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
-; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1232-TRUE16-NEXT: v_add_f32_e32 v0, s9, v0
-; GFX1232-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1232-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1232-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1232-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
+; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v1
+; GFX1232-TRUE16-NEXT: v_and_or_b32 v1, v2, s8, v0
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-TRUE16-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
; GFX1232-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -11755,39 +11803,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1232-FAKE16-NEXT: s_not_b32 s8, s3
; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0
; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s7
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7
; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
-; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1232-FAKE16-NEXT: v_add_f32_e32 v0, s9, v0
-; GFX1232-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1232-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1232-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1232-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s8, v0
-; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT: v_and_or_b32 v1, v2, s8, v0
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-FAKE16-NEXT: s_or_b32 s3, vcc_lo, s3
; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
; GFX1232-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -11869,28 +11919,28 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX8-NEXT: s_lshr_b32 s11, s10, 16
; GFX8-NEXT: s_mov_b32 s4, s2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: s_mov_b32 s5, s3
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NEXT: v_add_f16_e32 v2, s10, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX8-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_add_f16_e32 v1, s10, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: uniform_fadd_v2f16:
@@ -11904,25 +11954,25 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_pk_add_f16 v0, v1, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX9-NEXT: v_pk_add_f16 v1, v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_cbranch_execnz .LBB20_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: uniform_fadd_v2f16:
@@ -11937,17 +11987,17 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX1064-NEXT: s_mov_b32 s5, s3
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v0, s4
; GFX1064-NEXT: s_mov_b32 s4, s2
; GFX1064-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_pk_add_f16 v0, v1, s10
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT: v_pk_add_f16 v1, v2, s10
+; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v2
+; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1064-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_cbranch_execnz .LBB20_1
@@ -11955,7 +12005,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v2, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: uniform_fadd_v2f16:
@@ -11970,17 +12020,17 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX1032-NEXT: s_mov_b32 s5, s3
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v0, s4
; GFX1032-NEXT: s_mov_b32 s4, s2
; GFX1032-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_pk_add_f16 v0, v1, s8
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT: v_pk_add_f16 v1, v2, s8
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v2
+; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1032-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_cbranch_execnz .LBB20_1
@@ -11988,7 +12038,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v2, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: uniform_fadd_v2f16:
@@ -12003,18 +12053,19 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1164-NEXT: s_load_b32 s4, s[2:3], 0x0
; GFX1164-NEXT: s_mov_b32 s5, s3
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v0, s4
; GFX1164-NEXT: s_mov_b32 s4, s2
; GFX1164-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_pk_add_f16 v0, v1, s10
-; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-NEXT: v_pk_add_f16 v1, v2, s10
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[8:9]
@@ -12023,7 +12074,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v2, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: uniform_fadd_v2f16:
@@ -12038,17 +12089,19 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1132-NEXT: s_load_b32 s4, s[2:3], 0x0
; GFX1132-NEXT: s_mov_b32 s5, s3
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s4
+; GFX1132-NEXT: v_mov_b32_e32 v0, s4
; GFX1132-NEXT: s_mov_b32 s4, s2
; GFX1132-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_pk_add_f16 v0, v1, s8
-; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-NEXT: v_pk_add_f16 v1, v2, s8
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
@@ -12057,7 +12110,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v2, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: uniform_fadd_v2f16:
@@ -12072,18 +12125,19 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1264-NEXT: s_load_b32 s4, s[2:3], 0x0
; GFX1264-NEXT: s_mov_b32 s5, s3
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mov_b32_e32 v1, s4
+; GFX1264-NEXT: v_mov_b32_e32 v0, s4
; GFX1264-NEXT: s_mov_b32 s4, s2
; GFX1264-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_pk_add_f16 v0, v1, s10
-; GFX1264-NEXT: v_mov_b32_e32 v3, v1
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT: s_wait_loadcnt 0x0
-; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-NEXT: v_pk_add_f16 v1, v2, s10
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[8:9]
@@ -12092,7 +12146,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v2, off, s[0:3], null
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: uniform_fadd_v2f16:
@@ -12107,17 +12161,19 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1232-NEXT: s_load_b32 s4, s[2:3], 0x0
; GFX1232-NEXT: s_mov_b32 s5, s3
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_mov_b32_e32 v1, s4
+; GFX1232-NEXT: v_mov_b32_e32 v0, s4
; GFX1232-NEXT: s_mov_b32 s4, s2
; GFX1232-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_pk_add_f16 v0, v1, s8
-; GFX1232-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT: s_wait_loadcnt 0x0
-; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-NEXT: v_pk_add_f16 v1, v2, s8
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
@@ -12126,7 +12182,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v2, off, s[0:3], null
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
%rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x half> %val monotonic, align 4
store <2 x half> %rmw, ptr addrspace(1) %result
@@ -12204,41 +12260,41 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX8-NEXT: s_mov_b32 s4, s10
; GFX8-NEXT: s_mov_b32 s5, s11
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX8-NEXT: v_add_f32_e32 v0, s12, v0
-; GFX8-NEXT: v_add_f32_e32 v2, s13, v2
+; GFX8-NEXT: v_add_f32_e32 v1, s13, v1
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_mov_b32 s11, 0xf000
; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: uniform_fadd_v2bf16:
@@ -12255,40 +12311,40 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX9-NEXT: s_lshl_b32 s14, s0, 16
; GFX9-NEXT: s_and_b32 s15, s0, 0xffff0000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_mov_b32 s4, s10
; GFX9-NEXT: s_mov_b32 s5, s11
; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX9-NEXT: v_add_f32_e32 v0, s14, v0
-; GFX9-NEXT: v_add_f32_e32 v2, s15, v2
+; GFX9-NEXT: v_add_f32_e32 v1, s15, v1
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s12
-; GFX9-NEXT: v_add3_u32 v5, v5, v2, s12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add3_u32 v5, v5, v1, s12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT: v_perm_b32 v0, v2, v0, s13
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_perm_b32 v1, v1, v0, s13
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB21_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: uniform_fadd_v2bf16:
@@ -12306,30 +12362,30 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1064-NEXT: s_mov_b32 s4, s10
; GFX1064-NEXT: s_mov_b32 s5, s11
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s1
+; GFX1064-NEXT: v_mov_b32_e32 v0, s1
; GFX1064-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1064-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1064-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX1064-NEXT: v_add_f32_e32 v0, s12, v0
-; GFX1064-NEXT: v_add_f32_e32 v2, s13, v2
+; GFX1064-NEXT: v_add_f32_e32 v1, s13, v1
; GFX1064-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1064-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX1064-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX1064-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1064-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1064-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1064-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX1064-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX1064-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1064-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1064-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX1064-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX1064-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1064-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT: v_perm_b32 v1, v1, v0, 0x7060302
+; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v2
+; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_execnz .LBB21_1
@@ -12337,7 +12393,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: uniform_fadd_v2bf16:
@@ -12354,31 +12410,31 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1032-NEXT: s_and_b32 s3, s0, 0xffff0000
; GFX1032-NEXT: s_mov_b32 s5, s11
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v0, s4
; GFX1032-NEXT: s_mov_b32 s4, s10
; GFX1032-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1032-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1032-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX1032-NEXT: v_add_f32_e32 v0, s2, v0
-; GFX1032-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1032-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1032-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1032-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX1032-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX1032-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1032-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1032-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX1032-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX1032-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX1032-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1032-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1032-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX1032-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX1032-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0
-; GFX1032-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT: v_perm_b32 v1, v1, v0, 0x7060302
+; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v2
+; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1032-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX1032-NEXT: s_cbranch_execnz .LBB21_1
@@ -12386,7 +12442,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-TRUE16-LABEL: uniform_fadd_v2bf16:
@@ -12403,40 +12459,40 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-TRUE16-NEXT: s_lshl_b32 s11, s4, 16
; GFX1164-TRUE16-NEXT: s_mov_b32 s4, s2
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s5
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX1164-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX1164-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1164-TRUE16-NEXT: .p2align 6
; GFX1164-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1164-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1164-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX1164-TRUE16-NEXT: v_add_f32_e32 v1, s10, v1
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_add_f32_e32 v2, s10, v2
; GFX1164-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX1164-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX1164-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX1164-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
-; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX1164-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
+; GFX1164-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v0, v1
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[8:9]
@@ -12446,7 +12502,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1164-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-TRUE16-NEXT: s_endpgm
;
; GFX1164-FAKE16-LABEL: uniform_fadd_v2bf16:
@@ -12464,37 +12520,37 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-FAKE16-NEXT: s_mov_b32 s4, s10
; GFX1164-FAKE16-NEXT: s_mov_b32 s5, s11
; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s1
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s1
; GFX1164-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1164-FAKE16-NEXT: .p2align 6
; GFX1164-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1164-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1164-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-FAKE16-NEXT: v_add_f32_e32 v0, s12, v0
+; GFX1164-FAKE16-NEXT: v_add_f32_e32 v1, s13, v1
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_add_f32_e32 v2, s13, v2
; GFX1164-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX1164-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX1164-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1164-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1164-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX1164-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT: v_perm_b32 v1, v1, v0, 0x7060302
+; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -12504,7 +12560,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1164-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], 0
+; GFX1164-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX1164-FAKE16-NEXT: s_endpgm
;
; GFX1132-TRUE16-LABEL: uniform_fadd_v2bf16:
@@ -12521,39 +12577,39 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1132-TRUE16-NEXT: s_lshl_b32 s10, s4, 16
; GFX1132-TRUE16-NEXT: s_mov_b32 s4, s2
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s5
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX1132-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX1132-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-TRUE16-NEXT: .p2align 6
; GFX1132-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1132-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT: v_dual_add_f32 v1, s9, v1 :: v_dual_lshlrev_b32 v0, 16, v2
+; GFX1132-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-TRUE16-NEXT: v_add_f32_e32 v2, s9, v2
+; GFX1132-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX1132-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1132-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX1132-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX1132-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1132-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX1132-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1132-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1132-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX1132-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX1132-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v0, v1
+; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-TRUE16-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s8
@@ -12563,7 +12619,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1132-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-TRUE16-NEXT: s_endpgm
;
; GFX1132-FAKE16-LABEL: uniform_fadd_v2bf16:
@@ -12580,37 +12636,38 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1132-FAKE16-NEXT: s_and_b32 s3, s0, 0xffff0000
; GFX1132-FAKE16-NEXT: s_mov_b32 s5, s11
; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s4
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s4
; GFX1132-FAKE16-NEXT: s_mov_b32 s4, s10
; GFX1132-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX1132-FAKE16-NEXT: .p2align 6
; GFX1132-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1132-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1132-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT: v_dual_add_f32 v1, s3, v1 :: v_dual_lshlrev_b32 v0, 16, v2
+; GFX1132-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132-FAKE16-NEXT: v_add_f32_e32 v0, s2, v0
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-FAKE16-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1132-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1132-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX1132-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1132-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX1132-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1132-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT: v_perm_b32 v1, v1, v0, 0x7060302
+; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1132-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -12620,7 +12677,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1132-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], 0
+; GFX1132-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX1132-FAKE16-NEXT: s_endpgm
;
; GFX1264-TRUE16-LABEL: uniform_fadd_v2bf16:
@@ -12637,39 +12694,39 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1264-TRUE16-NEXT: s_lshl_b32 s11, s4, 16
; GFX1264-TRUE16-NEXT: s_mov_b32 s4, s2
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s5
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX1264-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX1264-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1264-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1264-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1264-TRUE16-NEXT: v_add_f32_e32 v0, s11, v0
+; GFX1264-TRUE16-NEXT: v_add_f32_e32 v1, s10, v1
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_add_f32_e32 v2, s10, v2
; GFX1264-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX1264-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX1264-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1264-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX1264-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX1264-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1264-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1264-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
-; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v0, v1
+; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[8:9]
@@ -12678,7 +12735,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1264-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], null
+; GFX1264-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1264-TRUE16-NEXT: s_endpgm
;
; GFX1264-FAKE16-LABEL: uniform_fadd_v2bf16:
@@ -12696,37 +12753,37 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1264-FAKE16-NEXT: s_mov_b32 s4, s10
; GFX1264-FAKE16-NEXT: s_mov_b32 s5, s11
; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s1
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s1
; GFX1264-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1264-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1264-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1264-FAKE16-NEXT: v_add_f32_e32 v0, s12, v0
+; GFX1264-FAKE16-NEXT: v_add_f32_e32 v1, s13, v1
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_add_f32_e32 v2, s13, v2
; GFX1264-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX1264-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX1264-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1264-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1264-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX1264-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1264-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1264-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff
; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT: v_perm_b32 v1, v1, v0, 0x7060302
+; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
@@ -12735,7 +12792,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1264-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], null
+; GFX1264-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], null
; GFX1264-FAKE16-NEXT: s_endpgm
;
; GFX1232-TRUE16-LABEL: uniform_fadd_v2bf16:
@@ -12752,38 +12809,39 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1232-TRUE16-NEXT: s_lshl_b32 s10, s4, 16
; GFX1232-TRUE16-NEXT: s_mov_b32 s4, s2
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s5
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX1232-TRUE16-NEXT: s_mov_b32 s5, s3
; GFX1232-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: v_dual_add_f32 v1, s9, v1 :: v_dual_lshlrev_b32 v0, 16, v2
+; GFX1232-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1232-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-TRUE16-NEXT: v_add_f32_e32 v2, s9, v2
+; GFX1232-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX1232-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1232-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX1232-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1232-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX1232-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1232-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v0, v1
+; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-TRUE16-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s8
@@ -12792,7 +12850,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1232-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], null
+; GFX1232-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1232-TRUE16-NEXT: s_endpgm
;
; GFX1232-FAKE16-LABEL: uniform_fadd_v2bf16:
@@ -12809,37 +12867,38 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1232-FAKE16-NEXT: s_and_b32 s3, s0, 0xffff0000
; GFX1232-FAKE16-NEXT: s_mov_b32 s5, s11
; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s4
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s4
; GFX1232-FAKE16-NEXT: s_mov_b32 s4, s10
; GFX1232-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1232-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX1232-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_dual_add_f32 v1, s3, v1 :: v_dual_lshlrev_b32 v0, 16, v2
+; GFX1232-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1232-FAKE16-NEXT: v_add_f32_e32 v0, s2, v0
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-FAKE16-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1232-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1232-FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1232-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1232-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX1232-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1232-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX1232-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1232-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0
; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX1232-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0
+; GFX1232-FAKE16-NEXT: v_perm_b32 v1, v1, v0, 0x7060302
+; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2
; GFX1232-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -12848,7 +12907,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1232-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], null
+; GFX1232-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], null
; GFX1232-FAKE16-NEXT: s_endpgm
%rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val monotonic, align 4
store <2 x bfloat> %rmw, ptr addrspace(1) %result
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index ae90cfb631e8d..509ba295ea7f7 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -32,12 +32,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1.bb103:
- ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000)
+ ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.2(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
- ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.58, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.59, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.2:
; GFX90A-NEXT: successors: %bb.3(0x80000000)
@@ -51,14 +51,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.3.Flow17:
- ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000)
+ ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.58(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
; GFX90A-NEXT: renamable $vgpr15 = V_MOV_B32_e32 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr17 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.58, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.4.bb15:
; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000)
@@ -122,12 +122,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr24 = V_MOV_B32_e32 0, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.7.Flow19:
- ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000)
+ ; GFX90A-NEXT: successors: %bb.63(0x40000000), %bb.8(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0
; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec
+ ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.63, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.8.Flow32:
; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000)
@@ -506,8 +506,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: S_BRANCH %bb.38
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.41.bb41:
- ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67
+ ; GFX90A-NEXT: successors: %bb.47(0x40000000), %bb.42(0x40000000)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr62_sgpr63, $sgpr66_sgpr67
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc
@@ -533,17 +533,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec
+ ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.47, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.42.Flow24:
; GFX90A-NEXT: successors: %bb.40(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec
; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
@@ -556,8 +556,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: S_BRANCH %bb.40
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.43.bb55:
- ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
+ ; GFX90A-NEXT: successors: %bb.49(0x40000000), %bb.44(0x40000000)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
@@ -565,13 +565,19 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc
- ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.49, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.44:
; GFX90A-NEXT: successors: %bb.45(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr56, $vgpr18, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr61, $vgpr58, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr3, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: bb.45.Flow26:
+ ; GFX90A-NEXT: successors: %bb.46(0x80000000)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr56, $vgpr18, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr3, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63, $vgpr58, $vgpr61
+ ; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -586,8 +592,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.45.Flow26:
- ; GFX90A-NEXT: successors: %bb.47(0x80000000)
+ ; GFX90A-NEXT: bb.46.Flow26:
+ ; GFX90A-NEXT: successors: %bb.48(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -595,16 +601,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
- ; GFX90A-NEXT: S_BRANCH %bb.47
+ ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
+ ; GFX90A-NEXT: S_BRANCH %bb.48
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.46.bb48:
- ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+ ; GFX90A-NEXT: bb.47.bb48:
+ ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.48(0x40000000)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr62_sgpr63, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr58_sgpr59, $sgpr56_sgpr57
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc
@@ -613,7 +619,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51)
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1
- ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = COPY renamable $sgpr36_sgpr37
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
@@ -634,53 +640,56 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.47.Flow25:
+ ; GFX90A-NEXT: bb.48.Flow25:
; GFX90A-NEXT: successors: %bb.42(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
; GFX90A-NEXT: S_BRANCH %bb.42
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.48.bb63:
- ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000)
- ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
+ ; GFX90A-NEXT: bb.49.bb63:
+ ; GFX90A-NEXT: successors: %bb.51(0x40000000), %bb.50(0x40000000)
+ ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0
- ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.51, implicit $vcc
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.49:
- ; GFX90A-NEXT: successors: %bb.44(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+ ; GFX90A-NEXT: bb.50:
+ ; GFX90A-NEXT: successors: %bb.45(0x80000000)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1
- ; GFX90A-NEXT: S_BRANCH %bb.44
+ ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0
+ ; GFX90A-NEXT: S_BRANCH %bb.45
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.50.bb68:
- ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+ ; GFX90A-NEXT: bb.51.bb68:
+ ; GFX90A-NEXT: successors: %bb.55(0x40000000), %bb.52(0x40000000)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr58_sgpr59, $sgpr56_sgpr57
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
- ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.55, implicit $vcc
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.51:
- ; GFX90A-NEXT: successors: %bb.45(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+ ; GFX90A-NEXT: bb.52:
+ ; GFX90A-NEXT: successors: %bb.46(0x80000000)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -692,20 +701,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
- ; GFX90A-NEXT: S_BRANCH %bb.45
+ ; GFX90A-NEXT: S_BRANCH %bb.46
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.52.bb80:
- ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000)
+ ; GFX90A-NEXT: bb.53.bb80:
+ ; GFX90A-NEXT: successors: %bb.60(0x40000000), %bb.54(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc
+ ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.60, implicit killed $scc
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.53:
- ; GFX90A-NEXT: successors: %bb.61(0x80000000)
+ ; GFX90A-NEXT: bb.54:
+ ; GFX90A-NEXT: successors: %bb.62(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0
@@ -720,17 +729,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
- ; GFX90A-NEXT: S_BRANCH %bb.61
+ ; GFX90A-NEXT: S_BRANCH %bb.62
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.54.bb73:
- ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55
+ ; GFX90A-NEXT: bb.55.bb73:
+ ; GFX90A-NEXT: successors: %bb.53(0x40000000), %bb.56(0x40000000)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr6 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76)
; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec
; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0
@@ -745,17 +755,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec
+ ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.53, implicit $exec
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.55.Flow29:
- ; GFX90A-NEXT: successors: %bb.45(0x80000000)
+ ; GFX90A-NEXT: bb.56.Flow29:
+ ; GFX90A-NEXT: successors: %bb.46(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc
- ; GFX90A-NEXT: S_BRANCH %bb.45
+ ; GFX90A-NEXT: S_BRANCH %bb.46
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.56.bb90:
- ; GFX90A-NEXT: successors: %bb.60(0x80000000)
+ ; GFX90A-NEXT: bb.57.bb90:
+ ; GFX90A-NEXT: successors: %bb.61(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
@@ -773,9 +783,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
- ; GFX90A-NEXT: S_BRANCH %bb.60
+ ; GFX90A-NEXT: S_BRANCH %bb.61
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.57:
+ ; GFX90A-NEXT: bb.58:
; GFX90A-NEXT: successors: %bb.7(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
@@ -810,7 +820,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0
; GFX90A-NEXT: S_BRANCH %bb.7
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.58.bb105:
+ ; GFX90A-NEXT: bb.59.bb105:
; GFX90A-NEXT: successors: %bb.3(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
@@ -827,8 +837,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1
; GFX90A-NEXT: S_BRANCH %bb.3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.59.bb85:
- ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000)
+ ; GFX90A-NEXT: bb.60.bb85:
+ ; GFX90A-NEXT: successors: %bb.57(0x40000000), %bb.61(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec
@@ -846,17 +856,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec
+ ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.57, implicit $exec
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.60.Flow31:
- ; GFX90A-NEXT: successors: %bb.61(0x80000000)
+ ; GFX90A-NEXT: bb.61.Flow31:
+ ; GFX90A-NEXT: successors: %bb.62(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.61.Flow30:
- ; GFX90A-NEXT: successors: %bb.55(0x80000000)
+ ; GFX90A-NEXT: bb.62.Flow30:
+ ; GFX90A-NEXT: successors: %bb.56(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -865,48 +875,48 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr52_sgpr53, implicit-def dead $scc
- ; GFX90A-NEXT: S_BRANCH %bb.55
+ ; GFX90A-NEXT: S_BRANCH %bb.56
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.62.bb140:
- ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000)
+ ; GFX90A-NEXT: bb.63.bb140:
+ ; GFX90A-NEXT: successors: %bb.69(0x40000000), %bb.64(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
- ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.68, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.69, implicit $vcc
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.63.Flow13:
- ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000)
+ ; GFX90A-NEXT: bb.64.Flow13:
+ ; GFX90A-NEXT: successors: %bb.65(0x40000000), %bb.67(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc
- ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.67, implicit $vcc
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.64.bb159:
- ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000)
+ ; GFX90A-NEXT: bb.65.bb159:
+ ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.66(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
- ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.67, implicit $exec
+ ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.68, implicit $exec
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.65.Flow10:
- ; GFX90A-NEXT: successors: %bb.66(0x80000000)
+ ; GFX90A-NEXT: bb.66.Flow10:
+ ; GFX90A-NEXT: successors: %bb.67(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.66.Flow14:
+ ; GFX90A-NEXT: bb.67.Flow14:
; GFX90A-NEXT: successors: %bb.8(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = COPY $exec
; GFX90A-NEXT: S_BRANCH %bb.8
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.67.bb161:
- ; GFX90A-NEXT: successors: %bb.65(0x80000000)
+ ; GFX90A-NEXT: bb.68.bb161:
+ ; GFX90A-NEXT: successors: %bb.66(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec
@@ -922,10 +932,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
- ; GFX90A-NEXT: S_BRANCH %bb.65
+ ; GFX90A-NEXT: S_BRANCH %bb.66
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.68.bb174:
- ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000)
+ ; GFX90A-NEXT: bb.69.bb174:
+ ; GFX90A-NEXT: successors: %bb.73(0x40000000), %bb.70(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec
@@ -938,17 +948,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
- ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.73, implicit $vcc
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.69.Flow:
- ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000)
+ ; GFX90A-NEXT: bb.70.Flow:
+ ; GFX90A-NEXT: successors: %bb.71(0x40000000), %bb.72(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
- ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc
+ ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.70.bb186:
- ; GFX90A-NEXT: successors: %bb.71(0x80000000)
+ ; GFX90A-NEXT: bb.71.bb186:
+ ; GFX90A-NEXT: successors: %bb.72(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec
@@ -976,15 +986,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.71.Flow9:
- ; GFX90A-NEXT: successors: %bb.63(0x80000000)
+ ; GFX90A-NEXT: bb.72.Flow9:
+ ; GFX90A-NEXT: successors: %bb.64(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0
- ; GFX90A-NEXT: S_BRANCH %bb.63
+ ; GFX90A-NEXT: S_BRANCH %bb.64
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: bb.72.bb196:
- ; GFX90A-NEXT: successors: %bb.69(0x80000000)
+ ; GFX90A-NEXT: bb.73.bb196:
+ ; GFX90A-NEXT: successors: %bb.70(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec
@@ -992,7 +1002,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec
; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0
- ; GFX90A-NEXT: S_BRANCH %bb.69
+ ; GFX90A-NEXT: S_BRANCH %bb.70
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
%i11 = icmp eq i32 %i, 0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 348862d4d8ced..36370361b677d 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -248,20 +248,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB1_1
@@ -291,22 +291,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB1_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -317,22 +317,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s20
-; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB1_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -343,23 +343,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s20
-; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB1_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1038,20 +1038,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB4_1
@@ -1063,21 +1063,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1088,22 +1088,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1114,22 +1114,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB4_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1140,22 +1140,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s20
-; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB4_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1166,23 +1166,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s20
-; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v2, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB4_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2137,20 +2137,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1]
-; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
+; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2178,21 +2178,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v6, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
+; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2208,23 +2208,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX10-NEXT: s_add_i32 s4, s20, 0x800
; GFX10-NEXT: v_mov_b32_e32 v6, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v10, v5
-; GFX10-NEXT: v_mov_b32_e32 v9, v4
+; GFX10-NEXT: v_mov_b32_e32 v10, v3
+; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v8, v3
-; GFX10-NEXT: v_mov_b32_e32 v7, v2
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v2, v7
+; GFX10-NEXT: v_mov_b32_e32 v3, v8
+; GFX10-NEXT: v_mov_b32_e32 v4, v9
+; GFX10-NEXT: v_mov_b32_e32 v5, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v4, v7
-; GFX10-NEXT: v_mov_b32_e32 v5, v8
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB9_1
@@ -2245,25 +2245,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX908-NEXT: v_mov_b32_e32 v10, v5
-; GFX908-NEXT: v_mov_b32_e32 v9, v4
-; GFX908-NEXT: v_mov_b32_e32 v8, v3
-; GFX908-NEXT: v_mov_b32_e32 v7, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v3
+; GFX908-NEXT: v_mov_b32_e32 v9, v2
+; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v9
+; GFX908-NEXT: v_mov_b32_e32 v5, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v7
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2274,25 +2274,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, v5
-; GFX8-NEXT: v_mov_b32_e32 v9, v4
-; GFX8-NEXT: v_mov_b32_e32 v8, v3
-; GFX8-NEXT: v_mov_b32_e32 v7, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v3
+; GFX8-NEXT: v_mov_b32_e32 v9, v2
+; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
+; GFX8-NEXT: v_mov_b32_e32 v4, v9
+; GFX8-NEXT: v_mov_b32_e32 v5, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v7
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2303,25 +2303,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX7-NEXT: s_add_i32 s6, s20, 0x800
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v10, v5
-; GFX7-NEXT: v_mov_b32_e32 v9, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v10, v3
+; GFX7-NEXT: v_mov_b32_e32 v9, v2
+; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v7
+; GFX7-NEXT: v_mov_b32_e32 v3, v8
+; GFX7-NEXT: v_mov_b32_e32 v4, v9
+; GFX7-NEXT: v_mov_b32_e32 v5, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v5, v8
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2332,26 +2332,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s20
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
+; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
; GFX6-NEXT: s_add_i32 s6, s20, 0x800
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_mov_b32_e32 v6, s6
; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v10, v3
+; GFX6-NEXT: v_mov_b32_e32 v9, v2
+; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[0:1]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v10, v5
-; GFX6-NEXT: v_mov_b32_e32 v9, v4
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v2, v7
+; GFX6-NEXT: v_mov_b32_e32 v3, v8
+; GFX6-NEXT: v_mov_b32_e32 v4, v9
+; GFX6-NEXT: v_mov_b32_e32 v5, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v5, v8
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB9_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3430,41 +3430,41 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3479,41 +3479,41 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3523,7 +3523,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
@@ -3532,23 +3532,23 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5
; GFX942-NEXT: v_add_f16_e32 v2, v2, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2
-; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v4
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3557,40 +3557,40 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3599,40 +3599,40 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3640,35 +3640,35 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3677,7 +3677,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
@@ -3686,22 +3686,22 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5
; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3709,8 +3709,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v5, s4
-; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
@@ -3719,23 +3719,23 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5
; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3743,8 +3743,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
@@ -3753,24 +3753,24 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5
; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v3, s7, v2
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3778,38 +3778,38 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -3818,39 +3818,39 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3877,28 +3877,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3925,28 +3925,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3962,7 +3962,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
; GFX942-NEXT: v_mov_b32_e32 v1, s4
-; GFX942-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
@@ -3971,18 +3971,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5
; GFX942-NEXT: v_add_f16_e32 v2, v2, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2
-; GFX942-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4000,28 +4000,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4041,28 +4041,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4080,25 +4080,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
@@ -4112,7 +4112,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
@@ -4121,17 +4121,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5
; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4144,7 +4144,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
; GFX908-NEXT: v_mov_b32_e32 v3, s4
-; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
@@ -4153,18 +4153,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5
; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4177,7 +4177,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
@@ -4186,19 +4186,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5
; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v4, s7, v2
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4210,33 +4210,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4248,34 +4248,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4299,11 +4299,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7
; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4317,7 +4317,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-TRUE16-NEXT: ; %bb.2:
@@ -4327,17 +4327,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4352,14 +4354,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4367,7 +4368,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4382,11 +4383,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, -4, v6
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v11, v7
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v10, v7
; GFX12-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4400,7 +4401,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -4410,17 +4411,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, v6
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
; GFX12-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v7, v8, v10, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8
; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4435,14 +4438,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4450,7 +4452,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4475,7 +4477,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen
+; GFX942-NEXT: buffer_load_dword v6, v10, s[4:7], 0 offen
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_1
; GFX942-NEXT: ; %bb.2:
@@ -4485,12 +4487,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB15_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, v4, v9
; GFX942-NEXT: v_add_f16_e32 v6, v6, v5
; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX942-NEXT: v_and_or_b32 v8, v9, v11, v6
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4504,21 +4507,20 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v8
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB15_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4529,11 +4531,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7
; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4545,7 +4547,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-TRUE16-NEXT: ; %bb.2:
@@ -4555,17 +4557,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4579,14 +4583,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -4595,7 +4598,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4606,11 +4609,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, -4, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v10, v7
; GFX11-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4622,7 +4625,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -4632,17 +4635,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v6
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
; GFX11-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v7, v8, v10, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8
; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4656,14 +4661,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -4672,7 +4676,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4682,10 +4686,10 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX10-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX10-NEXT: v_and_b32_e32 v9, -4, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v11, v7
+; GFX10-NEXT: v_not_b32_e32 v10, v7
; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
; GFX10-NEXT: v_readfirstlane_b32 s9, v1
@@ -4695,7 +4699,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v6, v9, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB15_1
@@ -4705,14 +4709,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB15_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX10-NEXT: v_mov_b32_e32 v8, v6
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v8
; GFX10-NEXT: v_add_f16_e32 v6, v6, v5
; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX10-NEXT: v_mov_b32_e32 v9, v7
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: v_and_or_b32 v7, v8, v10, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v8
; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -4724,15 +4729,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB15_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -4741,7 +4745,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_cbranch_execnz .LBB15_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4765,7 +4769,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v6, v10, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2:
@@ -4775,12 +4779,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v9
; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5
; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX90A-NEXT: v_and_or_b32 v8, v9, v11, v6
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -4792,33 +4797,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v8
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v9, -4, v4
; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v11, v6
+; GFX908-NEXT: v_not_b32_e32 v10, v6
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -4830,7 +4834,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v6, v9, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2:
@@ -4840,13 +4844,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB15_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v8
; GFX908-NEXT: v_add_f16_e32 v6, v6, v5
; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX908-NEXT: v_mov_b32_e32 v9, v7
+; GFX908-NEXT: v_and_or_b32 v7, v8, v10, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: v_mov_b32_e32 v7, v8
; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -4858,33 +4863,32 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB15_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v9, -4, v4
; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v11, v6
+; GFX8-NEXT: v_not_b32_e32 v10, v6
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -4896,7 +4900,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v6, v9, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2:
@@ -4906,14 +4910,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB15_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v8
; GFX8-NEXT: v_add_f16_e32 v6, v6, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX8-NEXT: v_and_b32_e32 v8, v7, v11
-; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
-; GFX8-NEXT: v_mov_b32_e32 v9, v7
+; GFX8-NEXT: v_and_b32_e32 v7, v8, v10
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v7, v8
; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -4925,21 +4930,20 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB15_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4961,18 +4965,19 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5
; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB15_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
@@ -5002,7 +5007,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB15_3
@@ -5031,18 +5035,20 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5
; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
@@ -5072,7 +5078,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB15_3
@@ -5100,55 +5105,55 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5160,54 +5165,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5216,36 +5221,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
-; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5257,147 +5262,147 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5405,34 +5410,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5445,40 +5450,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
-; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5486,42 +5491,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5529,38 +5534,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -5569,39 +5574,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -5619,48 +5624,48 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5678,47 +5683,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5733,36 +5738,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v2, s4
-; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB17_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5773,46 +5778,46 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5825,45 +5830,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5876,36 +5881,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -5918,34 +5923,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5957,35 +5962,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s4
-; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5997,37 +6002,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6039,33 +6044,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6077,34 +6082,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6146,7 +6151,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-TRUE16-NEXT: ; %bb.2:
@@ -6157,26 +6162,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6200,7 +6206,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -6241,7 +6246,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -6252,25 +6257,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6294,7 +6300,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -6327,7 +6332,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
+; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2:
@@ -6339,6 +6344,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB18_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_add_f32_e32 v4, v4, v11
@@ -6372,7 +6378,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB18_3
@@ -6405,7 +6410,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-TRUE16-NEXT: ; %bb.2:
@@ -6417,25 +6422,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6458,7 +6464,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6495,7 +6500,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -6507,24 +6512,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6547,7 +6553,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6580,7 +6585,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
@@ -6591,9 +6596,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_add_f32_e32 v4, v4, v10
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
@@ -6623,7 +6629,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -6656,7 +6661,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2:
@@ -6668,6 +6673,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -6698,7 +6704,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
@@ -6728,7 +6733,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2:
@@ -6740,6 +6745,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_add_f32_e32 v4, v4, v10
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -6771,7 +6777,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB18_3
@@ -6801,7 +6806,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2:
@@ -6812,6 +6817,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_add_f32_e32 v4, v4, v10
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -6845,7 +6851,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB18_3
@@ -6873,18 +6878,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB18_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
@@ -6914,7 +6920,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB18_3
@@ -6943,18 +6948,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
@@ -6984,7 +6991,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB18_3
@@ -7285,21 +7291,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -7315,20 +7320,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
@@ -7358,24 +7363,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v2, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8277,21 +8282,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8307,20 +8311,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -8332,21 +8336,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8357,22 +8361,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8383,24 +8387,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v2, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8804,21 +8808,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8834,20 +8837,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
@@ -8859,21 +8862,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8884,22 +8887,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
-; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_pk_add_f16 v4, v5, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB25_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8910,24 +8913,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
-; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v2, v5, v0
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB25_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9493,40 +9496,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX942-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX942-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_cbranch_execnz .LBB27_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9536,10 +9539,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -9547,34 +9551,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -9587,10 +9592,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -9598,32 +9604,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -9638,38 +9645,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v2, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -9681,39 +9688,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9727,37 +9734,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v2, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9771,38 +9778,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11100,40 +11107,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX942-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX942-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_cbranch_execnz .LBB30_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11143,10 +11150,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -11154,34 +11162,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -11194,10 +11203,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -11205,32 +11215,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -11245,38 +11256,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v2, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB30_1
@@ -11288,39 +11299,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11334,37 +11345,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v2, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB30_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11378,38 +11389,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11963,40 +11974,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX942-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX942-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_cbranch_execnz .LBB32_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12006,10 +12017,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -12017,34 +12029,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -12057,10 +12070,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -12068,32 +12082,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -12108,38 +12123,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v2, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB32_1
@@ -12151,39 +12166,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12197,37 +12212,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v2, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB32_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12241,38 +12256,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12389,40 +12404,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX942-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX942-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_cbranch_execnz .LBB33_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12432,10 +12447,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -12443,34 +12459,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -12483,10 +12500,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -12494,32 +12512,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -12534,38 +12553,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v2, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB33_1
@@ -12577,39 +12596,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12623,37 +12642,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v2, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB33_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12667,38 +12686,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index ab867b089b875..3ad1e5c0b81e0 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -211,24 +211,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, s6
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX942-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_max_f32_e32 v2, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v4, v2, v0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -261,23 +261,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -291,21 +291,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v0, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f32_e32 v1, v5, v5
+; GFX908-NEXT: v_max_f32_e32 v4, v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB1_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -319,21 +319,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB1_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1396,7 +1396,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s16
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
@@ -1405,17 +1405,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[9:10], v[9:10]
+; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[2:3], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1439,7 +1440,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_add_i32 s4, s16, 0x800
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v6, s4
@@ -1448,18 +1449,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX11-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1493,26 +1495,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, s20
; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v10, v3
; GFX908-NEXT: v_mov_b32_e32 v9, v2
-; GFX908-NEXT: v_mov_b32_e32 v8, v1
-; GFX908-NEXT: v_mov_b32_e32 v7, v0
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX908-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v9
+; GFX908-NEXT: v_mov_b32_e32 v5, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1524,26 +1526,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v10, v3
; GFX8-NEXT: v_mov_b32_e32 v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v8, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, v0
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX8-NEXT: v_max_f64 v[7:8], v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
+; GFX8-NEXT: v_mov_b32_e32 v4, v9
+; GFX8-NEXT: v_mov_b32_e32 v5, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2499,42 +2501,43 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2546,46 +2549,47 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2594,30 +2598,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
-; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX942-NEXT: v_max_f16_e32 v0, v0, v0
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX942-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX942-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2632,125 +2636,127 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f16_e32 v1, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2758,29 +2764,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2793,35 +2799,35 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
-; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX908-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX908-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX908-NEXT: v_max_f16_e32 v1, v1, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2829,36 +2835,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2866,38 +2872,38 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2906,39 +2912,39 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2966,29 +2972,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3006,39 +3013,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3053,30 +3061,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v2, s4
-; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX942-NEXT: v_max_f16_e32 v0, v0, v0
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX942-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX942-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3095,30 +3103,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3131,38 +3140,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3175,32 +3185,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f16_e32 v1, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -3213,29 +3223,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3247,30 +3257,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s4
-; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX908-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX908-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX908-NEXT: v_max_f16_e32 v1, v1, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3282,31 +3292,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3318,33 +3328,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3356,34 +3366,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3436,18 +3446,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -3462,14 +3475,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3477,7 +3489,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3510,7 +3522,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -3521,17 +3533,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v10
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v10
; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -3555,7 +3569,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -3588,7 +3601,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
+; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
; GFX942-NEXT: ; %bb.2:
@@ -3599,6 +3612,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB12_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7
; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
; GFX942-NEXT: v_max_f16_e32 v4, v4, v11
@@ -3627,7 +3641,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB12_3
@@ -3671,18 +3684,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -3696,14 +3712,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -3712,7 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -3739,7 +3754,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -3750,17 +3765,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v10
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -3783,7 +3800,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -3815,7 +3831,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
@@ -3826,9 +3842,10 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB12_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
; GFX10-NEXT: v_max_f16_e32 v4, v4, v10
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -3854,7 +3871,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -3887,7 +3903,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2:
@@ -3898,6 +3914,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7
; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11
@@ -3924,7 +3941,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_3
@@ -3954,7 +3970,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
; GFX908-NEXT: ; %bb.2:
@@ -3965,6 +3981,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB12_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
; GFX908-NEXT: v_max_f16_e32 v4, v4, v10
@@ -3992,7 +4009,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB12_3
@@ -4022,7 +4038,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2:
@@ -4033,6 +4049,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB12_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
; GFX8-NEXT: v_max_f16_e32 v4, v4, v10
@@ -4061,7 +4078,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB12_3
@@ -4089,18 +4105,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5
; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB12_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
@@ -4130,7 +4147,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB12_3
@@ -4159,18 +4175,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5
; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB12_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
@@ -4200,7 +4218,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB12_3
@@ -4228,55 +4245,55 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4288,54 +4305,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4344,36 +4361,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
-; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v0
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4385,147 +4402,147 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4533,34 +4550,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v0
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4573,40 +4590,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
-; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4614,42 +4631,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f32_e32 v3, v3, v5
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4657,39 +4674,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4698,40 +4715,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -4749,48 +4766,48 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4808,47 +4825,47 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4863,36 +4880,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v2, s4
-; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v0
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4903,46 +4920,46 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4955,45 +4972,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5006,36 +5023,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
@@ -5048,34 +5065,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v0
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5087,35 +5104,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s4
-; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5127,37 +5144,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5169,34 +5186,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5208,35 +5225,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5278,7 +5295,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-TRUE16-NEXT: ; %bb.2:
@@ -5289,26 +5306,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v10
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5332,7 +5350,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -5373,7 +5390,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -5384,25 +5401,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v10
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5426,7 +5444,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -5459,7 +5476,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
+; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_1
; GFX942-NEXT: ; %bb.2:
@@ -5471,6 +5488,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB15_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_max_f32_e32 v4, v4, v11
@@ -5504,7 +5522,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB15_3
@@ -5537,7 +5554,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-TRUE16-NEXT: ; %bb.2:
@@ -5549,25 +5566,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5590,7 +5608,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -5627,7 +5644,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -5639,24 +5656,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v10
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5679,7 +5697,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -5712,7 +5729,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB15_1
@@ -5723,9 +5740,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB15_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_max_f32_e32 v4, v4, v10
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
@@ -5755,7 +5773,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -5788,7 +5805,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2:
@@ -5800,6 +5817,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_max_f32_e32 v4, v4, v11
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -5830,7 +5848,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_3
@@ -5860,7 +5877,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2:
@@ -5872,6 +5889,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB15_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_max_f32_e32 v4, v4, v10
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -5903,7 +5921,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB15_3
@@ -5933,7 +5950,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2:
@@ -5944,6 +5961,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB15_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_max_f32_e32 v4, v4, v10
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -5977,7 +5995,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB15_3
@@ -6005,18 +6022,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB15_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -6047,7 +6065,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB15_3
@@ -6076,18 +6093,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -6118,7 +6137,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB15_3
@@ -6468,7 +6486,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -6476,16 +6494,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
+; GFX12-NEXT: v_mov_b32_e32 v5, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_pk_max_num_f16 v1, v5, v5
+; GFX12-NEXT: v_pk_max_num_f16 v4, v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6499,25 +6518,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, s6
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v5, v5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB17_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6529,24 +6548,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v4, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6560,24 +6580,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v4, v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -6589,23 +6609,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6619,21 +6639,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX908-NEXT: v_pk_max_f16 v0, v0, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX908-NEXT: v_pk_max_f16 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX908-NEXT: v_pk_max_f16 v4, v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6647,25 +6667,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: v_mov_b32_e32 v6, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f16_sdwa v1, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v6, v6
+; GFX8-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v5, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7931,43 +7951,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s16
; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_max_num_f32 v0, v0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -7984,42 +8006,44 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s16
; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2
-; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8033,40 +8057,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX942-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v3
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX942-NEXT: v_max_f32_e32 v3, v3, v0
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_cbranch_execnz .LBB20_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8076,10 +8100,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -8087,34 +8112,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_max_f32 v0, v0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8127,10 +8153,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -8138,32 +8165,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8178,38 +8206,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_max_f32_e32 v5, v5, v3
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v0
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v2, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
@@ -8221,39 +8249,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX90A-NEXT: v_max_f32_e32 v3, v3, v0
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8267,37 +8295,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_max_f32_e32 v5, v5, v3
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX908-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_max_f32_e32 v2, v2, v0
+; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v2, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8311,38 +8339,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v0
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 1a25904dd553f..1f51c93d08db1 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -211,24 +211,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, s6
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX942-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_max_f32_e32 v2, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v4, v2, v0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -261,23 +261,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
+; GFX90A-NEXT: v_min_f32_e32 v4, v2, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -291,21 +291,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v0, v0
+; GFX908-NEXT: v_max_f32_e32 v0, v0, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f32_e32 v1, v5, v5
+; GFX908-NEXT: v_min_f32_e32 v4, v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB1_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -319,21 +319,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB1_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1396,7 +1396,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s16
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
@@ -1405,17 +1405,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[9:10], v[9:10]
+; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[2:3], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1439,7 +1440,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: s_add_i32 s4, s16, 0x800
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v6, s4
@@ -1448,18 +1449,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX11-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1493,26 +1495,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, s20
; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX908-NEXT: s_add_i32 s6, s20, 0x800
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_mov_b32_e32 v6, s6
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v10, v3
; GFX908-NEXT: v_mov_b32_e32 v9, v2
-; GFX908-NEXT: v_mov_b32_e32 v8, v1
-; GFX908-NEXT: v_mov_b32_e32 v7, v0
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX908-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_mov_b32_e32 v3, v8
+; GFX908-NEXT: v_mov_b32_e32 v4, v9
+; GFX908-NEXT: v_mov_b32_e32 v5, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v2, v7
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1524,26 +1526,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX8-NEXT: s_add_i32 s6, s20, 0x800
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v10, v3
; GFX8-NEXT: v_mov_b32_e32 v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v8, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, v0
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f64 v[2:3], v[9:10], v[9:10]
+; GFX8-NEXT: v_min_f64 v[7:8], v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, v8
+; GFX8-NEXT: v_mov_b32_e32 v4, v9
+; GFX8-NEXT: v_mov_b32_e32 v5, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v2, v7
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2499,42 +2501,43 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2546,46 +2549,47 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2594,30 +2598,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
-; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX942-NEXT: v_max_f16_e32 v0, v0, v0
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX942-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX942-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2632,125 +2636,127 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX10-NEXT: v_min_f16_e32 v1, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2758,29 +2764,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2793,35 +2799,35 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
-; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX908-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX908-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX908-NEXT: v_min_f16_e32 v1, v1, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2829,36 +2835,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2866,38 +2872,38 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2906,39 +2912,39 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2966,29 +2972,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3006,39 +3013,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3053,30 +3061,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v2, s4
-; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX942-NEXT: v_max_f16_e32 v0, v0, v0
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX942-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX942-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, s6, v5
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3095,30 +3103,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3131,38 +3140,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3175,32 +3185,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX10-NEXT: v_min_f16_e32 v1, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -3213,29 +3223,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v5
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3247,30 +3257,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s4
-; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX908-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX908-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX908-NEXT: v_min_f16_e32 v1, v1, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3282,31 +3292,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3318,33 +3328,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3356,34 +3366,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3436,18 +3446,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l
-; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -3462,14 +3475,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3477,7 +3489,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3510,7 +3522,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -3521,17 +3533,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, v4, v10
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, v4, v10
; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -3555,7 +3569,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -3588,7 +3601,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
+; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
; GFX942-NEXT: ; %bb.2:
@@ -3599,6 +3612,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB12_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: v_lshrrev_b32_e32 v4, v8, v7
; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
; GFX942-NEXT: v_min_f16_e32 v4, v4, v11
@@ -3627,7 +3641,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB12_3
@@ -3671,18 +3684,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l
-; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -3696,14 +3712,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -3712,7 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -3739,7 +3754,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -3750,17 +3765,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, v4, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, v4, v10
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -3783,7 +3800,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -3815,7 +3831,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
@@ -3826,9 +3842,10 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB12_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
; GFX10-NEXT: v_min_f16_e32 v4, v4, v10
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -3854,7 +3871,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -3887,7 +3903,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2:
@@ -3898,6 +3914,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7
; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11
@@ -3924,7 +3941,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_3
@@ -3954,7 +3970,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
; GFX908-NEXT: ; %bb.2:
@@ -3965,6 +3981,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB12_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
; GFX908-NEXT: v_min_f16_e32 v4, v4, v10
@@ -3992,7 +4009,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB12_3
@@ -4022,7 +4038,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2:
@@ -4033,6 +4049,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB12_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
; GFX8-NEXT: v_min_f16_e32 v4, v4, v10
@@ -4061,7 +4078,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB12_3
@@ -4089,18 +4105,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v5
; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB12_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
@@ -4130,7 +4147,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB12_3
@@ -4159,18 +4175,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v5
; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB12_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
@@ -4200,7 +4218,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB12_3
@@ -4228,55 +4245,55 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4288,54 +4305,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4344,36 +4361,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
-; GFX942-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX942-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v0
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4385,147 +4402,147 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4533,34 +4550,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v0
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4573,40 +4590,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
-; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4614,42 +4631,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f32_e32 v3, v3, v5
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4657,39 +4674,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s4
-; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4698,40 +4715,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s4
-; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -4749,48 +4766,48 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4808,47 +4825,47 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], null offen
; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4863,36 +4880,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_addk_i32 s16, 0x200
; GFX942-NEXT: s_and_b32 s4, s16, -4
-; GFX942-NEXT: v_mov_b32_e32 v2, s4
-; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX942-NEXT: s_and_b32 s4, s16, 3
; GFX942-NEXT: s_lshl_b32 s6, s4, 3
; GFX942-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX942-NEXT: s_not_b32 s7, s4
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX942-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v0
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4903,46 +4920,46 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4955,45 +4972,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v3, s[0:3], 0 offen
; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5006,36 +5023,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v5, s6, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
@@ -5048,34 +5065,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v0
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5087,35 +5104,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s4
-; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX908-NEXT: v_mov_b32_e32 v3, s4
+; GFX908-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX908-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v2, v2, v1, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v5, s7, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5127,37 +5144,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s7, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5169,34 +5186,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5208,35 +5225,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s6, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s7, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_mov_b32_e32 v2, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5278,7 +5295,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-TRUE16-NEXT: ; %bb.2:
@@ -5289,26 +5306,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v10
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5332,7 +5350,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -5373,7 +5390,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], null offen
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -5384,25 +5401,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v10
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5426,7 +5444,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -5459,7 +5476,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
+; GFX942-NEXT: buffer_load_dword v4, v9, s[4:7], 0 offen
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_1
; GFX942-NEXT: ; %bb.2:
@@ -5471,6 +5488,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB15_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_min_f32_e32 v4, v4, v11
@@ -5504,7 +5522,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB15_3
@@ -5537,7 +5554,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-TRUE16-NEXT: ; %bb.2:
@@ -5549,25 +5566,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5590,7 +5608,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -5627,7 +5644,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: buffer_load_b32 v4, v8, s[4:7], 0 offen
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -5639,24 +5656,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v10
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5679,7 +5697,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -5712,7 +5729,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX10-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB15_1
@@ -5723,9 +5740,10 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB15_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_min_f32_e32 v4, v4, v10
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
@@ -5755,7 +5773,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -5788,7 +5805,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v9, s[8:11], 0 offen
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2:
@@ -5800,6 +5817,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11
; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -5830,7 +5848,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_3
@@ -5860,7 +5877,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX908-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2:
@@ -5872,6 +5889,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB15_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_min_f32_e32 v4, v4, v10
; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -5903,7 +5921,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB15_3
@@ -5933,7 +5950,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX8-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2:
@@ -5944,6 +5961,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB15_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_min_f32_e32 v4, v4, v10
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -5977,7 +5995,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB15_3
@@ -6005,18 +6022,19 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB15_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -6047,7 +6065,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB15_3
@@ -6076,18 +6093,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v4, v8, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -6118,7 +6137,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB15_3
@@ -6468,7 +6486,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -6476,16 +6494,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
+; GFX12-NEXT: v_mov_b32_e32 v5, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_pk_max_num_f16 v1, v5, v5
+; GFX12-NEXT: v_pk_min_num_f16 v4, v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6499,25 +6518,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s6, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, s6
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v5, v5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX942-NEXT: v_pk_min_f16 v4, v2, v0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB17_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6529,24 +6548,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v0, v0, v2
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX11-NEXT: v_pk_min_f16 v4, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6560,24 +6580,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX10-NEXT: v_pk_min_f16 v4, v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -6589,23 +6609,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX90A-NEXT: v_pk_min_f16 v4, v2, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6619,21 +6639,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
+; GFX908-NEXT: v_pk_max_f16 v0, v0, v0
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX908-NEXT: v_pk_min_f16 v0, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX908-NEXT: v_pk_min_f16 v4, v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6647,25 +6667,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v5, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: v_mov_b32_e32 v6, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_max_f16_sdwa v1, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v6, v6
+; GFX8-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v5, v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7931,43 +7951,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, s16
; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_min_num_f32 v0, v0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -7984,42 +8006,44 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s16
; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v1
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2
-; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8033,40 +8057,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
-; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
; GFX942-NEXT: v_mov_b32_e32 v4, s4
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX942-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v3
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX942-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX942-NEXT: v_min_f32_e32 v3, v3, v0
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX942-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX942-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX942-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
; GFX942-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_cbranch_execnz .LBB20_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8076,10 +8100,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -8087,34 +8112,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_min_f32 v0, v0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8127,10 +8153,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s16
; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -8138,32 +8165,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_dual_min_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8178,38 +8206,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_min_f32_e32 v5, v5, v3
-; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v0
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX10-NEXT: v_perm_b32 v5, v2, v1, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v2, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
@@ -8221,39 +8249,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
-; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX90A-NEXT: v_min_f32_e32 v3, v3, v0
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8267,37 +8295,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
; GFX908-NEXT: v_mov_b32_e32 v4, s4
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX908-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_min_f32_e32 v5, v5, v3
-; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX908-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_min_f32_e32 v2, v2, v0
+; GFX908-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v5, v5, v1, s8
+; GFX908-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX908-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v2, v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v2, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8311,38 +8339,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_min_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v0
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 51398a45055eb..1396099dbfa6a 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -220,19 +220,22 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v0
+; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT: v_mov_b32_e32 v21, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
@@ -243,31 +246,32 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1
+; GFX9-O0-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 0
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 1
; GFX9-O0-NEXT: s_mov_b32 s10, s6
-; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2
+; GFX9-O0-NEXT: v_writelane_b32 v29, s10, 2
; GFX9-O0-NEXT: s_mov_b32 s11, s7
-; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, s10, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v0, v2, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s10
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v13, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v19, vcc
+; GFX9-O0-NEXT: v_writelane_b32 v29, s11, 3
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, s10, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v1, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v13, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v19, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -275,25 +279,25 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[9:10], s[4:5]
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v19, v0, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v13, v0, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v19, v2, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v13, v2, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20
@@ -424,18 +428,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9]
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3
; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr16
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v1
; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10
; GFX9-O0-NEXT: ; implicit-def: $sgpr13
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12
@@ -514,33 +518,33 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4
-; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5
+; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 4
+; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
@@ -548,67 +552,64 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB0_8
; GFX9-O0-NEXT: .LBB0_1: ; %Flow
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 6
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 4
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_9
; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 1
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -633,408 +634,408 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_3
; GFX9-O0-NEXT: .LBB0_5: ; %Flow1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 8
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 9
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10
-; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_readlane_b32 s6, v29, 10
+; GFX9-O0-NEXT: v_readlane_b32 s7, v29, 11
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29
+; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7
; GFX9-O0-NEXT: s_mov_b32 s5, 1
-; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22
-; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3]
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[26:27]
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[2:3]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v27
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v26
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v14
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1]
-; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[10:11]
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27
-; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v25
+; GFX9-O0-NEXT: v_or3_b32 v10, v10, v11, v28
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26
-; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v24
+; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v11
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25
-; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17
+; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v10
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v16, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v14, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v11, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1
; GFX9-O0-NEXT: s_mov_b32 s8, s5
-; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
+; GFX9-O0-NEXT: v_and_b32_e64 v4, v15, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4
-; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-O0-NEXT: v_and_b32_e64 v6, v17, s4
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22
-; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20
+; GFX9-O0-NEXT: v_and_b32_e64 v22, v15, v22
+; GFX9-O0-NEXT: v_and_b32_e64 v20, v17, v20
; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22
-; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18
-; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
+; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, v22
+; GFX9-O0-NEXT: v_and_b32_e64 v17, v17, v18
+; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17
+; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v21
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v19
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v17, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v15, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1
; GFX9-O0-NEXT: s_mov_b32 s5, s8
; GFX9-O0-NEXT: s_mov_b32 s4, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-O0-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20
-; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19
-; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13]
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v8
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 6
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 10
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 11
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-O0-NEXT: s_branch .LBB0_1
; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v8, v[18:19]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15
; GFX9-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[16:17]
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20
-; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v4, s4
-; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s4
-; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7]
+; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v22
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v20
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v15
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, s4
+; GFX9-O0-NEXT: v_sub_u32_e64 v11, v8, s4
+; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v11, v[16:17]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[6:7]
; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v20
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15]
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], v8, v[16:17]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[6:7]
+; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s8, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s8
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v16, v11, v16, s[6:7]
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13
+; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v10
; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1
; GFX9-O0-NEXT: s_mov_b32 s7, s8
; GFX9-O0-NEXT: s_mov_b32 s6, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7
-; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s7
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s7
+; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v11, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13
+; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10
-; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 10
+; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 11
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_6
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
@@ -1047,129 +1048,129 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s6
; GFX9-O0-NEXT: s_mov_b32 s9, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4
+; GFX9-O0-NEXT: v_add_co_u32_e32 v7, vcc, v3, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11]
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
+; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], v2, v[11:12]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: s_mov_b32 s4, 64
; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2
-; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[9:10]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v15
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4
; GFX9-O0-NEXT: s_mov_b32 s10, 63
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s10, 0
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[10:11]
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], v2, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9
-; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
-; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
+; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 8
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5
; GFX9-O0-NEXT: s_branch .LBB0_7
; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
@@ -1213,7 +1214,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4]
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -1684,29 +1685,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_readlane_b32 s5, v31, 3
; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: s_nop 0
; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB0_5
; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2
@@ -1792,14 +1790,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_readlane_b32 s4, v31, 4
; GFX9-G-O0-NEXT: v_readlane_b32 s5, v31, 5
; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
@@ -2242,20 +2240,20 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], v[6:7]
-; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_nop 0
-; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s10
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_nop 0
-; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
@@ -2490,7 +2488,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -2547,16 +2545,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1
+; GFX9-O0-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 0
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 1
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v1, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT: v_or_b32_e64 v8, v0, v2
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
@@ -2601,18 +2599,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3
; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v1
; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10
; GFX9-O0-NEXT: ; implicit-def: $sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
@@ -2695,33 +2693,33 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2
-; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3
+; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 2
+; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
@@ -2729,50 +2727,47 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB1_8
; GFX9-O0-NEXT: .LBB1_1: ; %Flow
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 4
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB1_5
; GFX9-O0-NEXT: .LBB1_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 2
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 3
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2814,29 +2809,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB1_3
; GFX9-O0-NEXT: .LBB1_5: ; %Flow1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 6
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2854,214 +2849,214 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8
-; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_readlane_b32 s6, v29, 8
+; GFX9-O0-NEXT: v_readlane_b32 s7, v29, 9
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29
+; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7
; GFX9-O0-NEXT: s_mov_b32 s5, 1
-; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22
-; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3]
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[26:27]
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[2:3]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v27
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v26
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v14
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1]
-; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[10:11]
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27
-; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v25
+; GFX9-O0-NEXT: v_or3_b32 v10, v10, v11, v28
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26
-; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v24
+; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v11
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25
-; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17
+; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v10
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v16, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v14, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v11, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1
; GFX9-O0-NEXT: s_mov_b32 s8, s5
-; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
+; GFX9-O0-NEXT: v_and_b32_e64 v4, v15, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4
-; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-O0-NEXT: v_and_b32_e64 v6, v17, s4
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22
-; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20
+; GFX9-O0-NEXT: v_and_b32_e64 v22, v15, v22
+; GFX9-O0-NEXT: v_and_b32_e64 v20, v17, v20
; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22
-; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18
-; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
+; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, v22
+; GFX9-O0-NEXT: v_and_b32_e64 v17, v17, v18
+; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17
+; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v21
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v19
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v17, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v15, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1
; GFX9-O0-NEXT: s_mov_b32 s5, s8
; GFX9-O0-NEXT: s_mov_b32 s4, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-O0-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20
-; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19
-; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13]
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v8
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 4
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 5
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 4
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 5
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 8
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
@@ -3079,128 +3074,128 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB1_1
; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v8, v[18:19]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15
; GFX9-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[16:17]
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20
-; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v4, s4
-; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s4
-; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7]
+; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v22
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v20
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v15
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, s4
+; GFX9-O0-NEXT: v_sub_u32_e64 v11, v8, s4
+; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v11, v[16:17]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[6:7]
; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v20
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15]
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], v8, v[16:17]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[6:7]
+; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s8, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s8
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v16, v11, v16, s[6:7]
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13
+; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v10
; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1
; GFX9-O0-NEXT: s_mov_b32 s7, s8
; GFX9-O0-NEXT: s_mov_b32 s6, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7
-; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s7
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s7
+; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v11, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13
+; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8
-; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 8
+; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 9
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
@@ -3210,12 +3205,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB1_6
; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
@@ -3228,118 +3223,118 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s6
; GFX9-O0-NEXT: s_mov_b32 s9, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4
+; GFX9-O0-NEXT: v_add_co_u32_e32 v7, vcc, v3, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11]
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
+; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], v2, v[11:12]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: s_mov_b32 s4, 64
; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2
-; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[9:10]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v15
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4
; GFX9-O0-NEXT: s_mov_b32 s10, 63
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s10, 0
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[10:11]
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], v2, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9
-; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
-; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
+; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 6
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5
@@ -3358,7 +3353,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -3768,29 +3763,26 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_readlane_b32 s5, v32, 3
; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-G-O0-NEXT: s_nop 0
; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_branch .LBB1_5
; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2
@@ -3876,14 +3868,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_readlane_b32 s4, v32, 4
; GFX9-G-O0-NEXT: v_readlane_b32 s5, v32, 5
; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
@@ -4339,20 +4331,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5
; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], v[6:7]
-; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_nop 0
-; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s10
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_nop 0
-; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1f8a333..de439c6f46c6e 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -7,37 +7,37 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v26, v24
; SDAG-NEXT: v_mov_b32_e32 v27, v25
-; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v2, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v1, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v0, v16, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v1, v20
-; SDAG-NEXT: v_ffbh_u32_e32 v2, v21
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v21, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v18
+; SDAG-NEXT: v_ffbh_u32_e32 v2, v19
; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v0, v20, v16
+; SDAG-NEXT: v_or_b32_e32 v0, v18, v16
; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1
+; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v1
; SDAG-NEXT: v_ffbh_u32_e32 v22, v16
-; SDAG-NEXT: v_or_b32_e32 v1, v21, v17
+; SDAG-NEXT: v_or_b32_e32 v1, v19, v17
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT: v_min_u32_e32 v2, v19, v2
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22
+; SDAG-NEXT: v_min_u32_e32 v2, v21, v2
+; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v22
; SDAG-NEXT: v_ffbh_u32_e32 v22, v17
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT: v_min_u32_e32 v1, v19, v22
+; SDAG-NEXT: v_min_u32_e32 v1, v21, v22
; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2
; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc
@@ -46,17 +46,17 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc
; SDAG-NEXT: v_ffbh_u32_e32 v3, v29
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v28
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v28
; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3
; SDAG-NEXT: v_ffbh_u32_e32 v11, v0
; SDAG-NEXT: v_or_b32_e32 v3, v28, v1
-; SDAG-NEXT: v_min_u32_e32 v8, v8, v19
+; SDAG-NEXT: v_min_u32_e32 v8, v8, v21
; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v1
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v2, v11, v19
+; SDAG-NEXT: v_min_u32_e32 v2, v11, v21
; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8
; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
@@ -66,35 +66,35 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v20, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v20, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v8, v10
; SDAG-NEXT: v_or_b32_e32 v9, v3, v11
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v21, s[4:5]
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v17, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v19, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, v18, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB0_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2
-; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v2
+; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v2
; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[18:19], v20
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
@@ -102,16 +102,16 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], v34
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v34
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[18:19], v34
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[20:21], v35
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[18:19], v35
; SDAG-NEXT: v_or_b32_e32 v3, v3, v11
; SDAG-NEXT: v_or_b32_e32 v2, v2, v10
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v21, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v20, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5]
@@ -121,7 +121,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v30
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[18:19], v30
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30
; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10
; SDAG-NEXT: v_or_b32_e32 v11, v9, v11
@@ -131,9 +131,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8
; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v9, v19, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v8, v18, s[4:5]
; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30
; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v8, vcc
@@ -149,30 +149,30 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v3
; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_or_b32_e32 v19, v17, v19
-; SDAG-NEXT: v_or_b32_e32 v18, v16, v18
+; SDAG-NEXT: v_or_b32_e32 v21, v17, v21
+; SDAG-NEXT: v_or_b32_e32 v20, v16, v20
; SDAG-NEXT: v_or_b32_e32 v16, v22, v38
-; SDAG-NEXT: v_or_b32_e32 v17, v20, v39
+; SDAG-NEXT: v_or_b32_e32 v17, v18, v39
; SDAG-NEXT: v_or_b32_e32 v2, v2, v8
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v17
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v21, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v19, vcc
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v16, vcc
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc
; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; SDAG-NEXT: v_and_b32_e32 v20, v8, v29
+; SDAG-NEXT: v_and_b32_e32 v18, v8, v29
; SDAG-NEXT: v_and_b32_e32 v22, v8, v28
; SDAG-NEXT: v_and_b32_e32 v38, v8, v0
; SDAG-NEXT: v_and_b32_e32 v39, v8, v1
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
-; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20
-; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v17, v18
+; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v19, v22, vcc
; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
@@ -194,11 +194,11 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: .LBB0_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
; SDAG-NEXT: v_or_b32_e32 v0, v0, v16
-; SDAG-NEXT: v_or_b32_e32 v18, v11, v1
-; SDAG-NEXT: v_or_b32_e32 v19, v9, v3
+; SDAG-NEXT: v_or_b32_e32 v20, v11, v1
+; SDAG-NEXT: v_or_b32_e32 v21, v9, v3
; SDAG-NEXT: v_or_b32_e32 v22, v10, v0
; SDAG-NEXT: v_or_b32_e32 v23, v8, v2
; SDAG-NEXT: .LBB0_6: ; %Flow16
@@ -208,110 +208,110 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_mov_b32_e32 v20, v16
-; SDAG-NEXT: v_mov_b32_e32 v21, v17
+; SDAG-NEXT: v_mov_b32_e32 v18, v16
+; SDAG-NEXT: v_mov_b32_e32 v19, v17
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[4:5]
; SDAG-NEXT: v_ffbh_u32_e32 v1, v2
-; SDAG-NEXT: v_ffbh_u32_e32 v4, v3
-; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12
-; SDAG-NEXT: v_or_b32_e32 v0, v2, v6
-; SDAG-NEXT: v_ffbh_u32_e32 v9, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v3
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0, v12
+; SDAG-NEXT: v_or_b32_e32 v0, v2, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v9, v4
; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v1, v3, v7
+; SDAG-NEXT: v_or_b32_e32 v1, v3, v5
; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9
-; SDAG-NEXT: v_ffbh_u32_e32 v30, v7
-; SDAG-NEXT: v_min_u32_e32 v4, v10, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v30, v5
+; SDAG-NEXT: v_min_u32_e32 v6, v10, v6
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v7, s[4:5]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
; SDAG-NEXT: v_min_u32_e32 v1, v9, v30
-; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4
-; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_add_i32_e64 v6, s[8:9], 64, v6
+; SDAG-NEXT: v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9]
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5]
; SDAG-NEXT: v_ffbh_u32_e32 v10, v29
; SDAG-NEXT: v_ffbh_u32_e32 v11, v28
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v7, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v13, v6, v1, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v4, v29, v0
+; SDAG-NEXT: v_or_b32_e32 v6, v29, v0
; SDAG-NEXT: v_ffbh_u32_e32 v9, v0
; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10
-; SDAG-NEXT: v_or_b32_e32 v5, v28, v1
+; SDAG-NEXT: v_or_b32_e32 v7, v28, v1
; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9
; SDAG-NEXT: v_ffbh_u32_e32 v14, v1
; SDAG-NEXT: v_min_u32_e32 v10, v10, v11
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT: v_min_u32_e32 v4, v9, v14
-; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_min_u32_e32 v6, v9, v14
+; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 64, v10
; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13
-; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc
-; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4
+; SDAG-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v13
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v12, vcc
+; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v6
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v8, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v9, v10
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v9, v5, v11
+; SDAG-NEXT: v_or_b32_e32 v9, v7, v11
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; SDAG-NEXT: v_and_b32_e32 v8, 1, v12
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v4, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4
-; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6
+; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6
; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: v_mov_b32_e32 v9, 0
-; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc
; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v6
; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34
+; SDAG-NEXT: v_lshl_b64 v[6:7], v[4:5], v34
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35
-; SDAG-NEXT: v_or_b32_e32 v5, v5, v11
-; SDAG-NEXT: v_or_b32_e32 v4, v4, v10
+; SDAG-NEXT: v_or_b32_e32 v7, v7, v11
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v10
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -321,24 +321,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30
; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30
-; SDAG-NEXT: v_lshr_b64 v[37:38], v[6:7], v30
+; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
; SDAG-NEXT: v_mov_b32_e32 v14, 0
; SDAG-NEXT: v_mov_b32_e32 v15, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35
-; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36
+; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35
+; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc
; SDAG-NEXT: v_or_b32_e32 v9, v9, v49
; SDAG-NEXT: v_or_b32_e32 v8, v8, v48
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v5, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v4, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
@@ -346,23 +346,23 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: .LBB0_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3
; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5
-; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v7
+; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11
; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_or_b32_e32 v6, v6, v8
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v8
; SDAG-NEXT: v_or_b32_e32 v2, v2, v38
-; SDAG-NEXT: v_or_b32_e32 v4, v4, v39
-; SDAG-NEXT: v_or_b32_e32 v5, v13, v5
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v39
+; SDAG-NEXT: v_or_b32_e32 v7, v13, v7
; SDAG-NEXT: v_or_b32_e32 v11, v15, v11
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2
-; SDAG-NEXT: v_or_b32_e32 v4, v12, v4
+; SDAG-NEXT: v_or_b32_e32 v6, v12, v6
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v4, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v5, vcc
; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; SDAG-NEXT: v_and_b32_e32 v15, v8, v29
; SDAG-NEXT: v_and_b32_e32 v38, v8, v28
@@ -370,8 +370,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_and_b32_e32 v48, v8, v1
; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v48, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
@@ -390,7 +390,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB0_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11
; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1
; SDAG-NEXT: v_or_b32_e32 v0, v0, v4
@@ -402,11 +402,11 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26
; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24
-; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20
+; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18
; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16
-; SDAG-NEXT: v_xor_b32_e32 v4, v18, v3
+; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3
; SDAG-NEXT: v_xor_b32_e32 v5, v22, v2
-; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3
+; SDAG-NEXT: v_xor_b32_e32 v1, v21, v3
; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2
; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7
; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index 827cb4ac2589a..279c34722e272 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -30,8 +30,8 @@ define amdgpu_ps void @main(i32 %0, float %1) {
; ISA-NEXT: .LBB0_1: ; %Flow1
; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
; ISA-NEXT: s_or_b64 exec, exec, s[4:5]
-; ISA-NEXT: s_mov_b64 s[8:9], 0
; ISA-NEXT: s_mov_b64 s[4:5], s[6:7]
+; ISA-NEXT: s_mov_b64 s[8:9], 0
; ISA-NEXT: .LBB0_2: ; %Flow
; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
; ISA-NEXT: s_and_b64 s[6:7], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 883063b5471ca..e3bd4e7383598 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -5407,54 +5407,50 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_3
-; GFX90A-NEXT: ; %bb.1: ; %Flow2
-; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_6
-; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB24_4
+; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_4
-; GFX90A-NEXT: ; %bb.5: ; %Flow
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_2
+; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: .LBB24_4: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB24_2
-; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_cbranch_execz .LBB24_6
+; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
-; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index c603421ca15b4..25d59a26189c9 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -5407,54 +5407,50 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_3
-; GFX90A-NEXT: ; %bb.1: ; %Flow2
-; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_6
-; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB24_4
+; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9]
+; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_4
-; GFX90A-NEXT: ; %bb.5: ; %Flow
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_2
+; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: .LBB24_4: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB24_2
-; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_cbranch_execz .LBB24_6
+; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
-; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 100a560c1d127..afa57b8692aa5 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -711,20 +711,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB3_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -738,21 +738,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB3_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -892,20 +892,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB4_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -919,21 +919,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB4_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1074,7 +1074,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -1083,15 +1083,15 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB5_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1105,7 +1105,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -1114,16 +1114,16 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB5_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1457,20 +1457,20 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB7_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1484,21 +1484,21 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB7_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2059,20 +2059,20 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2086,21 +2086,21 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2818,20 +2818,20 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2845,21 +2845,21 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3000,20 +3000,20 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3027,21 +3027,21 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3156,20 +3156,20 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3183,21 +3183,21 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3352,20 +3352,20 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3379,21 +3379,21 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3747,20 +3747,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3774,21 +3774,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4141,20 +4141,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4168,21 +4168,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB21_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4505,20 +4505,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB23_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4532,21 +4532,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB23_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5189,20 +5189,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5216,21 +5216,21 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB27_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5344,20 +5344,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5371,21 +5371,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5500,7 +5500,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -5509,15 +5509,15 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5531,7 +5531,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -5540,16 +5540,16 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB29_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5843,20 +5843,20 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5870,21 +5870,21 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB31_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6172,20 +6172,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6199,21 +6199,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB33_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6567,20 +6567,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB35_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6594,21 +6594,21 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB35_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6891,20 +6891,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6918,21 +6918,21 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB37_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7769,23 +7769,23 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
; GFX7-NEXT: v_mov_b32_e32 v9, v5
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v4, v6
+; GFX7-NEXT: v_mov_b32_e32 v5, v7
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB41_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7799,24 +7799,24 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
; GFX6-NEXT: v_mov_b32_e32 v9, v5
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v4, v6
+; GFX6-NEXT: v_mov_b32_e32 v5, v7
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB41_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7975,23 +7975,23 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
; GFX7-NEXT: v_mov_b32_e32 v9, v5
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v4, v6
+; GFX7-NEXT: v_mov_b32_e32 v5, v7
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8005,24 +8005,24 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
; GFX6-NEXT: v_mov_b32_e32 v9, v5
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v4, v6
+; GFX6-NEXT: v_mov_b32_e32 v5, v7
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB42_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8182,7 +8182,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -8191,18 +8191,18 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
; GFX7-NEXT: v_mov_b32_e32 v9, v5
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v4, v6
+; GFX7-NEXT: v_mov_b32_e32 v5, v7
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB43_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8216,7 +8216,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -8225,19 +8225,19 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
; GFX6-NEXT: v_mov_b32_e32 v9, v5
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v4, v6
+; GFX6-NEXT: v_mov_b32_e32 v5, v7
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB43_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8597,36 +8597,36 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX7-NEXT: v_not_b32_e32 v7, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -8640,36 +8640,37 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX6-NEXT: v_not_b32_e32 v7, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB44_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -9032,36 +9033,36 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB45_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -9076,37 +9077,37 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB45_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -9471,36 +9472,36 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB46_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -9515,37 +9516,37 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB46_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -9888,28 +9889,28 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_not_b32_e32 v6, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB47_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9929,29 +9930,29 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_not_b32_e32 v6, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB47_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10304,31 +10305,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB48_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10346,32 +10347,32 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB48_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10726,31 +10727,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB49_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10768,32 +10769,32 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB49_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11066,28 +11067,28 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB50_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -11100,29 +11101,29 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB50_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -11379,23 +11380,23 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB51_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11412,24 +11413,24 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB51_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11800,36 +11801,36 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB52_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -11844,37 +11845,37 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB52_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -12229,31 +12230,31 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB53_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12271,32 +12272,32 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB53_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12731,35 +12732,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB54_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -12774,35 +12775,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB54_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -13246,35 +13248,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB55_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -13290,36 +13292,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB55_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -13765,35 +13767,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB56_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -13809,36 +13811,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB56_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -14253,30 +14255,30 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14294,31 +14296,31 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB57_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14752,30 +14754,30 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB58_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14794,31 +14796,31 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB58_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15254,30 +15256,30 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15296,31 +15298,31 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB59_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15678,28 +15680,28 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB60_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -15712,29 +15714,29 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB60_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -16076,23 +16078,23 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16109,24 +16111,24 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB61_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16578,35 +16580,35 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB62_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -16622,36 +16624,36 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB62_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -17087,30 +17089,30 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB63_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17129,31 +17131,31 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB63_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index faa3ee61427a2..b9774808f1ad1 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4806,36 +4806,36 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX7-NEXT: v_not_b32_e32 v7, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4849,36 +4849,37 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX6-NEXT: v_not_b32_e32 v7, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB26_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -5264,36 +5265,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -5308,37 +5309,37 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB27_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -5726,36 +5727,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -5770,37 +5771,37 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -6165,28 +6166,28 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_not_b32_e32 v6, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6206,29 +6207,29 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_not_b32_e32 v6, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_max_f32_e32 v3, v3, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_max_f32_e32 v4, v4, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB29_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6608,31 +6609,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6650,32 +6651,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB30_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7057,31 +7058,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7099,32 +7100,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB31_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7415,28 +7416,28 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7449,29 +7450,29 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -7750,23 +7751,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7783,24 +7784,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB33_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8194,36 +8195,36 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -8238,37 +8239,37 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB34_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -8650,31 +8651,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB35_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8692,32 +8693,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB35_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9152,36 +9153,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -9196,36 +9197,37 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB36_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -9669,36 +9671,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -9714,37 +9716,37 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB37_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -10190,36 +10192,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -10235,37 +10237,37 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB38_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -10680,31 +10682,31 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB39_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10722,32 +10724,32 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB39_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11181,31 +11183,31 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11224,32 +11226,32 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB40_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11685,31 +11687,31 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB41_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11728,32 +11730,32 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB41_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12111,29 +12113,29 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12146,30 +12148,30 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB42_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -12511,24 +12513,24 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB43_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12545,25 +12547,25 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB43_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13015,36 +13017,36 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -13060,37 +13062,37 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB44_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -13526,31 +13528,31 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB45_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13569,32 +13571,32 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB45_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index cb66f85ff3ae2..c30543642d314 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4806,36 +4806,36 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX7-NEXT: v_not_b32_e32 v7, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -4849,36 +4849,37 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX6-NEXT: v_not_b32_e32 v7, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB26_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -5264,36 +5265,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -5308,37 +5309,37 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB27_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -5726,36 +5727,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -5770,37 +5771,37 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -6165,28 +6166,28 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_not_b32_e32 v6, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6206,29 +6207,29 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_not_b32_e32 v6, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_min_f32_e32 v3, v3, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_min_f32_e32 v4, v4, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB29_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6608,31 +6609,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6650,32 +6651,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB30_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7057,31 +7058,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7099,32 +7100,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB31_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7415,28 +7416,28 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7449,29 +7450,29 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -7750,23 +7751,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7783,24 +7784,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB33_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8194,36 +8195,36 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -8238,37 +8239,37 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB34_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -8650,31 +8651,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB35_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8692,32 +8693,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB35_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9152,36 +9153,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -9196,36 +9197,37 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB36_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -9669,36 +9671,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -9714,37 +9716,37 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB37_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -10190,36 +10192,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -10235,37 +10237,37 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB38_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -10680,31 +10682,31 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB39_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10722,32 +10724,32 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB39_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11181,31 +11183,31 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11224,32 +11226,32 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB40_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11685,31 +11687,31 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB41_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11728,32 +11730,32 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB41_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12111,29 +12113,29 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12146,30 +12148,30 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB42_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -12511,24 +12513,24 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB43_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12545,25 +12547,25 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB43_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13015,36 +13017,36 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -13060,37 +13062,37 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB44_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -13526,31 +13528,31 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB45_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13569,32 +13571,32 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB45_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index f869b5778bfb2..5e4a5c649bb24 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -894,20 +894,20 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB3_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -921,21 +921,21 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB3_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1117,20 +1117,20 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB4_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1144,21 +1144,21 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB4_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1341,7 +1341,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -1350,15 +1350,15 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB5_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1372,7 +1372,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -1381,16 +1381,16 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB5_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1813,20 +1813,20 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB7_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1840,21 +1840,21 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB7_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2749,20 +2749,20 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2776,21 +2776,21 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2972,20 +2972,20 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2999,21 +2999,21 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3196,7 +3196,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -3205,15 +3205,15 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3227,7 +3227,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -3236,16 +3236,16 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3668,20 +3668,20 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3695,21 +3695,21 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_sub_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4663,23 +4663,23 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
; GFX7-NEXT: v_mov_b32_e32 v9, v5
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v4, v6
+; GFX7-NEXT: v_mov_b32_e32 v5, v7
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4693,24 +4693,24 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
; GFX6-NEXT: v_mov_b32_e32 v9, v5
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v4, v6
+; GFX6-NEXT: v_mov_b32_e32 v5, v7
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4895,23 +4895,23 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
; GFX7-NEXT: v_mov_b32_e32 v9, v5
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v4, v6
+; GFX7-NEXT: v_mov_b32_e32 v5, v7
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4925,24 +4925,24 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
; GFX6-NEXT: v_mov_b32_e32 v9, v5
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v4, v6
+; GFX6-NEXT: v_mov_b32_e32 v5, v7
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB20_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5128,7 +5128,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -5137,18 +5137,18 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
; GFX7-NEXT: v_mov_b32_e32 v9, v5
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v4, v6
+; GFX7-NEXT: v_mov_b32_e32 v5, v7
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5162,7 +5162,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -5171,19 +5171,19 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
; GFX6-NEXT: v_mov_b32_e32 v9, v5
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v4, v6
+; GFX6-NEXT: v_mov_b32_e32 v5, v7
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB21_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5543,36 +5543,36 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX7-NEXT: v_not_b32_e32 v7, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_sub_f32_e32 v4, v4, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB22_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v4
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -5586,36 +5586,37 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX6-NEXT: v_not_b32_e32 v7, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_sub_f32_e32 v4, v4, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB22_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -5978,36 +5979,36 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB23_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -6022,37 +6023,37 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB23_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -6417,36 +6418,36 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB24_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -6461,37 +6462,37 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB24_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -6834,28 +6835,28 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_not_b32_e32 v6, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_sub_f32_e32 v4, v4, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB25_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6875,29 +6876,29 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_not_b32_e32 v6, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_sub_f32_e32 v4, v4, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB25_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7250,31 +7251,31 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7292,32 +7293,32 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB26_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7672,31 +7673,31 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7714,32 +7715,32 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB27_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8012,28 +8013,28 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
@@ -8046,29 +8047,29 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -8325,23 +8326,23 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8358,24 +8359,24 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB29_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8746,36 +8747,36 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -8790,37 +8791,37 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB30_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -9175,31 +9176,31 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9217,32 +9218,32 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB31_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9677,35 +9678,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -9720,35 +9721,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -10192,35 +10194,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -10236,36 +10238,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB33_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -10711,35 +10713,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -10755,36 +10757,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB34_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -11199,30 +11201,30 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_not_b32_e32 v6, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX7-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB35_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11240,31 +11242,31 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_not_b32_e32 v6, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB35_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11698,30 +11700,30 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11740,31 +11742,31 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB36_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12200,30 +12202,30 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12242,31 +12244,31 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB37_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12624,28 +12626,28 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -12658,29 +12660,29 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB38_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -13022,23 +13024,23 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB39_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13055,24 +13057,24 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB39_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13524,35 +13526,35 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -13568,36 +13570,36 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB40_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -14033,30 +14035,30 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB41_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14075,31 +14077,31 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v4, v8, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v7, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v8
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB41_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index f7882e6f12022..b9a37c4b98a80 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -2167,22 +2167,22 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB51_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, v4, v2
-; SI-NEXT: v_not_b32_e32 v3, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, v5, v2
+; SI-NEXT: v_not_b32_e32 v4, v3
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB51_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2245,22 +2245,22 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB52_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, v4, v2
-; SI-NEXT: v_not_b32_e32 v3, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, v5, v2
+; SI-NEXT: v_not_b32_e32 v4, v3
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB52_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2487,38 +2487,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB55_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, s34, v1
-; SI-NEXT: v_not_b32_e32 v0, v0
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, s34, v2
+; SI-NEXT: v_not_b32_e32 v1, v0
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB55_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2578,38 +2578,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB56_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, s34, v1
-; SI-NEXT: v_not_b32_e32 v0, v0
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, s34, v2
+; SI-NEXT: v_not_b32_e32 v1, v0
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB56_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2862,22 +2862,22 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB59_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, v4, v2
-; SI-NEXT: v_not_b32_e32 v3, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, v5, v2
+; SI-NEXT: v_not_b32_e32 v4, v3
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB59_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3950,21 +3950,21 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB83_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB83_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4025,21 +4025,21 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB84_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB84_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4258,37 +4258,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB87_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v0, s34, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB87_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4346,37 +4346,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB88_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v0, s34, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB88_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4627,20 +4627,20 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB91_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_i32_e32 v0, s2, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v1, s2, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB91_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4726,20 +4726,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB92_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_i32_e32 v0, s8, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v1, s8, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB92_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4748,7 +4748,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_max_i32_ret_addr64_offset:
@@ -4840,20 +4840,20 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB93_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_i32_e32 v0, s2, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v1, s2, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB93_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4936,20 +4936,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB94_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_i32_e32 v0, s8, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v1, s8, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB94_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4958,7 +4958,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_max_i32_ret_addr64:
@@ -5041,21 +5041,21 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB95_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB95_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5203,21 +5203,21 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB97_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB97_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5278,21 +5278,21 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB98_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB98_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5511,37 +5511,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB101_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v0, s34, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB101_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -5599,37 +5599,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB102_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v0, s34, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB102_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -5880,20 +5880,20 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB105_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_u32_e32 v0, s2, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v1, s2, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB105_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5979,20 +5979,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB106_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_u32_e32 v0, s8, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v1, s8, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB106_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6001,7 +6001,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
@@ -6094,20 +6094,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB107_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_max_u32_e32 v0, s8, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v1, s8, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB107_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6116,7 +6116,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_umax_i32_ret_addr64:
@@ -6199,21 +6199,21 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB108_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB108_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6361,21 +6361,21 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB110_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_u32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB110_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6436,21 +6436,21 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB111_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_u32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB111_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6669,37 +6669,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB114_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v0, s34, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_u32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB114_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6757,37 +6757,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB115_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v0, s34, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_u32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB115_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7032,21 +7032,21 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB118_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_u32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB118_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7194,21 +7194,21 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB120_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB120_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7269,21 +7269,21 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB121_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB121_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7502,37 +7502,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB124_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v0, s34, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB124_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7590,37 +7590,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v4, s6, 0
-; SI-NEXT: v_writelane_b32 v4, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB125_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v0, s34, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB125_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v4, 1
-; SI-NEXT: v_readlane_b32 s6, v4, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7871,20 +7871,20 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB128_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_min_i32_e32 v0, s2, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v1, s2, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB128_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7970,20 +7970,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB129_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_min_i32_e32 v0, s8, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v1, s8, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB129_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7992,7 +7992,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_min_i32_ret_addr64_offset:
@@ -8080,20 +8080,20 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: .LBB130_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_min_i32_e32 v0, s6, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v1, s6, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execnz .LBB130_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8167,20 +8167,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB131_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_min_i32_e32 v0, s8, v1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v1, s8, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB131_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8189,7 +8189,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_min_i32_ret_addr64:
@@ -8272,21 +8272,21 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB132_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_min_i32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB132_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 59a99a6a0328d..9845064604bc1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -2205,27 +2205,27 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB50_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, v7, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, v6, v2
-; SI-NEXT: v_not_b32_e32 v5, v4
-; SI-NEXT: v_not_b32_e32 v4, v8
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, v9, v3
+; SI-NEXT: v_and_b32_e32 v5, v8, v2
+; SI-NEXT: v_not_b32_e32 v7, v4
+; SI-NEXT: v_not_b32_e32 v6, v5
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB50_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2294,27 +2294,27 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB51_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, v7, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, v6, v2
-; SI-NEXT: v_not_b32_e32 v5, v4
-; SI-NEXT: v_not_b32_e32 v4, v8
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, v9, v3
+; SI-NEXT: v_and_b32_e32 v5, v8, v2
+; SI-NEXT: v_not_b32_e32 v7, v4
+; SI-NEXT: v_not_b32_e32 v6, v5
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB51_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2578,44 +2578,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
; SI-NEXT: s_mov_b32 s34, s7
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB54_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, s34, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, s35, v2
-; SI-NEXT: v_not_b32_e32 v1, v0
-; SI-NEXT: v_not_b32_e32 v0, v4
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
; SI-NEXT: v_mov_b32_e32 v5, v1
; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, s34, v5
+; SI-NEXT: v_and_b32_e32 v1, s35, v4
+; SI-NEXT: v_not_b32_e32 v3, v0
+; SI-NEXT: v_not_b32_e32 v2, v1
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB54_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2683,44 +2683,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v8, s6, 0
-; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
; SI-NEXT: s_mov_b32 s34, s7
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB55_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, s34, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, s35, v2
-; SI-NEXT: v_not_b32_e32 v1, v0
-; SI-NEXT: v_not_b32_e32 v0, v4
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
; SI-NEXT: v_mov_b32_e32 v5, v1
; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, s34, v5
+; SI-NEXT: v_and_b32_e32 v1, s35, v4
+; SI-NEXT: v_not_b32_e32 v3, v0
+; SI-NEXT: v_not_b32_e32 v2, v1
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB55_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v8, 1
-; SI-NEXT: v_readlane_b32 s6, v8, 0
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3003,27 +3003,27 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB58_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, v7, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, v6, v2
-; SI-NEXT: v_not_b32_e32 v5, v4
-; SI-NEXT: v_not_b32_e32 v4, v8
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, v9, v3
+; SI-NEXT: v_and_b32_e32 v5, v8, v2
+; SI-NEXT: v_not_b32_e32 v7, v4
+; SI-NEXT: v_not_b32_e32 v6, v5
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB58_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4073,26 +4073,26 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB80_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB80_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4159,26 +4159,26 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB81_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB81_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4443,28 +4443,28 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: v_mov_b32_e32 v4, s35
; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB84_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB84_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4551,28 +4551,28 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: v_mov_b32_e32 v4, s35
; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB85_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB85_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4878,26 +4878,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: v_mov_b32_e32 v3, s9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB88_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB88_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4989,29 +4989,29 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: v_mov_b32_e32 v8, s5
-; SI-NEXT: v_mov_b32_e32 v9, s4
+; SI-NEXT: v_mov_b32_e32 v4, s5
+; SI-NEXT: v_mov_b32_e32 v5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s6
-; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB89_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB89_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5020,7 +5020,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_max_i64_ret_addr64_offset:
@@ -5119,26 +5119,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: v_mov_b32_e32 v3, s9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB90_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB90_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5227,29 +5227,29 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: v_mov_b32_e32 v8, s5
-; SI-NEXT: v_mov_b32_e32 v9, s4
+; SI-NEXT: v_mov_b32_e32 v4, s5
+; SI-NEXT: v_mov_b32_e32 v5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s6
-; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB91_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB91_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5258,7 +5258,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_max_i64_ret_addr64:
@@ -5347,26 +5347,26 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB92_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB92_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5535,26 +5535,26 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB94_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB94_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5621,26 +5621,26 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB95_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB95_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5905,28 +5905,28 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: v_mov_b32_e32 v4, s35
; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB98_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB98_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6013,28 +6013,28 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: v_mov_b32_e32 v4, s35
; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB99_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB99_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6340,26 +6340,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: v_mov_b32_e32 v3, s9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB102_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB102_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6451,29 +6451,29 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: v_mov_b32_e32 v8, s5
-; SI-NEXT: v_mov_b32_e32 v9, s4
+; SI-NEXT: v_mov_b32_e32 v4, s5
+; SI-NEXT: v_mov_b32_e32 v5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s6
-; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB103_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB103_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6482,7 +6482,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
@@ -6577,29 +6577,29 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: v_mov_b32_e32 v8, s5
-; SI-NEXT: v_mov_b32_e32 v9, s4
+; SI-NEXT: v_mov_b32_e32 v4, s5
+; SI-NEXT: v_mov_b32_e32 v5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s6
-; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB104_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB104_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6608,7 +6608,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_umax_i64_ret_addr64:
@@ -6697,26 +6697,26 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB105_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB105_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6885,26 +6885,26 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB107_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB107_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6971,26 +6971,26 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB108_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB108_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7255,28 +7255,28 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: v_mov_b32_e32 v4, s35
; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB111_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB111_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7363,28 +7363,28 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: v_mov_b32_e32 v4, s35
; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB112_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB112_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7683,26 +7683,26 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB115_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB115_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7871,26 +7871,26 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB117_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB117_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7957,26 +7957,26 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB118_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB118_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8241,28 +8241,28 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: v_mov_b32_e32 v4, s35
; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB121_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB121_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8349,28 +8349,28 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: v_mov_b32_e32 v4, s35
; SI-NEXT: v_mov_b32_e32 v5, s34
; SI-NEXT: .LBB122_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB122_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8676,26 +8676,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: v_mov_b32_e32 v3, s9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: .LBB125_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB125_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8787,29 +8787,29 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: v_mov_b32_e32 v8, s5
-; SI-NEXT: v_mov_b32_e32 v9, s4
+; SI-NEXT: v_mov_b32_e32 v4, s5
+; SI-NEXT: v_mov_b32_e32 v5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s6
-; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB126_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB126_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8818,7 +8818,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_min_i64_ret_addr64_offset:
@@ -8913,28 +8913,28 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s4
-; SI-NEXT: v_mov_b32_e32 v3, s5
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: .LBB127_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v3
-; SI-NEXT: v_mov_b32_e32 v8, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v2, v6
-; SI-NEXT: v_mov_b32_e32 v3, v7
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB127_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9014,29 +9014,29 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: v_mov_b32_e32 v8, s5
-; SI-NEXT: v_mov_b32_e32 v9, s4
+; SI-NEXT: v_mov_b32_e32 v4, s5
+; SI-NEXT: v_mov_b32_e32 v5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s6
-; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: .LBB128_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v3
-; SI-NEXT: v_mov_b32_e32 v6, v2
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; SI-NEXT: s_cbranch_execnz .LBB128_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9045,7 +9045,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: atomic_min_i64_ret_addr64:
@@ -9134,26 +9134,26 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB129_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
; SI-NEXT: v_mov_b32_e32 v9, v5
; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; SI-NEXT: v_mov_b32_e32 v4, v6
+; SI-NEXT: v_mov_b32_e32 v5, v7
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB129_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 82c58394c03bb..d17498a933be3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -34,19 +34,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2
; GFX7LESS-NEXT: .LBB0_3:
@@ -205,19 +205,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX7LESS-DPP-NEXT: .LBB0_3:
@@ -416,20 +416,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
; GFX7LESS-NEXT: .LBB1_5:
@@ -762,20 +762,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1163,19 +1163,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2
; GFX7LESS-NEXT: .LBB2_3:
@@ -1383,19 +1383,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX7LESS-DPP-NEXT: .LBB2_3:
@@ -1634,20 +1634,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
; GFX7LESS-NEXT: .LBB3_5:
@@ -1980,20 +1980,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2381,19 +2381,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
; GFX7LESS-NEXT: .LBB4_3:
@@ -2631,19 +2631,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX7LESS-DPP-NEXT: .LBB4_3:
@@ -2912,20 +2912,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
; GFX7LESS-NEXT: .LBB5_5:
@@ -3258,20 +3258,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3686,20 +3686,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4
; GFX7LESS-NEXT: .LBB6_5:
@@ -4032,20 +4032,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4433,19 +4433,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2
; GFX7LESS-NEXT: .LBB7_3:
@@ -4683,19 +4683,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX7LESS-DPP-NEXT: .LBB7_3:
@@ -4963,20 +4963,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4
; GFX7LESS-NEXT: .LBB8_5:
@@ -5335,20 +5335,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8004,23 +8004,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2
; GFX7LESS-NEXT: .LBB11_3:
@@ -8261,23 +8261,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX7LESS-DPP-NEXT: .LBB11_3:
@@ -8551,23 +8551,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4
; GFX7LESS-NEXT: .LBB12_5:
@@ -8941,23 +8941,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9437,23 +9437,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-NEXT: .LBB13_3:
@@ -9694,23 +9694,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-DPP-NEXT: .LBB13_3:
@@ -9984,23 +9984,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4
; GFX7LESS-NEXT: .LBB14_5:
@@ -10374,23 +10374,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10899,23 +10899,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4
; GFX7LESS-NEXT: .LBB15_5:
@@ -11289,23 +11289,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14051,19 +14051,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_2
; GFX7LESS-NEXT: .LBB18_3:
@@ -14222,19 +14222,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX7LESS-DPP-NEXT: .LBB18_3:
@@ -14397,19 +14397,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_2
; GFX7LESS-NEXT: .LBB19_3:
@@ -14568,19 +14568,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX7LESS-DPP-NEXT: .LBB19_3:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index f8f911b693e09..cc2f490cc1ff0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -30,20 +30,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2
; GFX7LESS-NEXT: .LBB0_3:
@@ -155,20 +155,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX7LESS-DPP-NEXT: .LBB0_3:
@@ -301,18 +301,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: v_max_f32_e32 v1, v1, v2
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -326,22 +326,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1
; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GFX7LESS-NEXT: v_max_f32_e32 v3, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
; GFX7LESS-NEXT: .LBB1_5:
@@ -666,20 +666,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v2, v1, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1068,20 +1068,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2
; GFX7LESS-NEXT: .LBB2_3:
@@ -1193,20 +1193,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX7LESS-DPP-NEXT: .LBB2_3:
@@ -1340,18 +1340,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: v_max_f32_e32 v1, v1, v2
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -1365,22 +1365,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1
; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GFX7LESS-NEXT: v_max_f32_e32 v3, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
; GFX7LESS-NEXT: .LBB3_5:
@@ -1705,20 +1705,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v2, v1, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2108,20 +2108,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
; GFX7LESS-NEXT: .LBB4_3:
@@ -2233,20 +2233,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX7LESS-DPP-NEXT: .LBB4_3:
@@ -2379,18 +2379,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: v_max_f32_e32 v1, v1, v2
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -2404,22 +2404,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1
; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GFX7LESS-NEXT: v_max_f32_e32 v3, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
; GFX7LESS-NEXT: .LBB5_5:
@@ -2744,20 +2744,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v2, v1, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4275,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4]
; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1064-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4300,26 +4300,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1064-NEXT: s_mov_b64 s[54:55], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53]
+; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53]
; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_addc_u32 s9, s35, 0
; GFX1064-NEXT: s_getpc_b64 s[0:1]
; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, s52
-; GFX1064-NEXT: v_mov_b32_e32 v3, s53
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
@@ -4331,16 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s13, s50
; GFX1064-NEXT: s_mov_b32 s14, s33
; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v3, s53
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0
-; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4
; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55]
@@ -5424,24 +5424,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], 4.0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2
; GFX7LESS-NEXT: .LBB8_3:
@@ -5590,24 +5590,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX7LESS-DPP-NEXT: .LBB8_3:
@@ -5777,12 +5777,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
@@ -5790,7 +5790,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -5804,25 +5804,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX7LESS-NEXT: v_max_f64 v[6:7], v[0:1], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4
; GFX7LESS-NEXT: .LBB9_5:
@@ -6191,23 +6191,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7821,20 +7821,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4]
; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1064-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7846,26 +7846,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1064-NEXT: s_mov_b64 s[54:55], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53]
+; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53]
; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_addc_u32 s9, s35, 0
; GFX1064-NEXT: s_getpc_b64 s[0:1]
; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, s52
-; GFX1064-NEXT: v_mov_b32_e32 v3, s53
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
@@ -7877,16 +7877,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s13, s50
; GFX1064-NEXT: s_mov_b32 s14, s33
; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v3, s53
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0
-; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4
; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55]
@@ -8970,20 +8970,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2
; GFX7LESS-NEXT: .LBB12_3:
@@ -9095,20 +9095,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX7LESS-DPP-NEXT: .LBB12_3:
@@ -9224,20 +9224,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-NEXT: .LBB13_3:
@@ -9349,20 +9349,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-DPP-NEXT: .LBB13_3:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 1f76a476107a3..b3d81b5e9aec8 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -30,20 +30,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2
; GFX7LESS-NEXT: .LBB0_3:
@@ -155,20 +155,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX7LESS-DPP-NEXT: .LBB0_3:
@@ -301,18 +301,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: v_min_f32_e32 v1, v1, v2
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -326,22 +326,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1
; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GFX7LESS-NEXT: v_min_f32_e32 v3, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
; GFX7LESS-NEXT: .LBB1_5:
@@ -666,20 +666,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v2, v1, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1068,20 +1068,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2
; GFX7LESS-NEXT: .LBB2_3:
@@ -1193,20 +1193,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX7LESS-DPP-NEXT: .LBB2_3:
@@ -1340,18 +1340,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: v_min_f32_e32 v1, v1, v2
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -1365,22 +1365,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1
; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GFX7LESS-NEXT: v_min_f32_e32 v3, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
; GFX7LESS-NEXT: .LBB3_5:
@@ -1705,20 +1705,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v2, v1, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2108,20 +2108,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
; GFX7LESS-NEXT: .LBB4_3:
@@ -2233,20 +2233,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX7LESS-DPP-NEXT: .LBB4_3:
@@ -2379,18 +2379,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX7LESS-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX7LESS-NEXT: v_min_f32_e32 v1, v1, v2
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -2404,22 +2404,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v1
; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GFX7LESS-NEXT: v_min_f32_e32 v3, v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
; GFX7LESS-NEXT: .LBB5_5:
@@ -2744,20 +2744,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v2, v1, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4275,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4]
; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1064-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4300,26 +4300,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1064-NEXT: s_mov_b64 s[54:55], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53]
+; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53]
; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_addc_u32 s9, s35, 0
; GFX1064-NEXT: s_getpc_b64 s[0:1]
; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, s52
-; GFX1064-NEXT: v_mov_b32_e32 v3, s53
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
@@ -4331,16 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: s_mov_b32 s13, s50
; GFX1064-NEXT: s_mov_b32 s14, s33
; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v3, s53
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0
-; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4
; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55]
@@ -5424,24 +5424,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2
; GFX7LESS-NEXT: .LBB8_3:
@@ -5590,24 +5590,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s7
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX7LESS-DPP-NEXT: .LBB8_3:
@@ -5777,12 +5777,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v5, 0x7ff80000
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000
; GFX7LESS-NEXT: .LBB9_1: ; %ComputeLoop
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
@@ -5790,7 +5790,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX7LESS-NEXT: s_cbranch_vccnz .LBB9_1
; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -5804,25 +5804,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX7LESS-NEXT: v_min_f64 v[6:7], v[0:1], v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_4
; GFX7LESS-NEXT: .LBB9_5:
@@ -6191,23 +6191,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[6:7]
+; GFX7LESS-DPP-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7821,20 +7821,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: s_movk_i32 s32, 0x800
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000
+; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4]
; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1064-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7846,26 +7846,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[52:53], s[34:35], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4]
; GFX1064-NEXT: s_mov_b64 s[54:55], 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[52:53]
+; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[52:53]
; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
+; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
; GFX1064-NEXT: s_add_u32 s8, s34, 44
; GFX1064-NEXT: s_addc_u32 s9, s35, 0
; GFX1064-NEXT: s_getpc_b64 s[0:1]
; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT: buffer_store_dword v5, off, s[64:67], 0 offset:4
-; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0
; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, s52
-; GFX1064-NEXT: v_mov_b32_e32 v3, s53
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: v_mov_b32_e32 v5, 8
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: v_mov_b32_e32 v7, 0
@@ -7877,16 +7877,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: s_mov_b32 s13, s50
; GFX1064-NEXT: s_mov_b32 s14, s33
; GFX1064-NEXT: s_mov_b64 s[2:3], s[66:67]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1064-NEXT: buffer_store_dword v1, off, s[64:67], 0 offset:12
-; GFX1064-NEXT: buffer_store_dword v0, off, s[64:67], 0 offset:8
-; GFX1064-NEXT: v_mov_b32_e32 v0, 8
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v4, off, s[64:67], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[64:67], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v3, s53
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: buffer_load_dword v4, off, s[64:67], 0
-; GFX1064-NEXT: buffer_load_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT: buffer_load_dword v1, off, s[64:67], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[64:67], 0 offset:4
; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: s_or_b64 s[54:55], vcc, s[54:55]
@@ -8970,20 +8970,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2
; GFX7LESS-NEXT: .LBB12_3:
@@ -9095,20 +9095,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX7LESS-DPP-NEXT: .LBB12_3:
@@ -9224,20 +9224,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-NEXT: .LBB13_3:
@@ -9349,20 +9349,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-DPP-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-DPP-NEXT: .LBB13_3:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 9db3c37045ccf..b6fa8d7e32199 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -34,19 +34,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2
; GFX7LESS-NEXT: .LBB0_3:
@@ -235,19 +235,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s6
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX7LESS-DPP-NEXT: .LBB0_3:
@@ -476,20 +476,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
; GFX7LESS-NEXT: .LBB1_5:
@@ -848,20 +848,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1275,19 +1275,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2
; GFX7LESS-NEXT: .LBB2_3:
@@ -1525,19 +1525,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX7LESS-DPP-NEXT: .LBB2_3:
@@ -1806,20 +1806,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
; GFX7LESS-NEXT: .LBB3_5:
@@ -2178,20 +2178,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2605,19 +2605,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
; GFX7LESS-NEXT: .LBB4_3:
@@ -2855,19 +2855,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX7LESS-DPP-NEXT: .LBB4_3:
@@ -3136,20 +3136,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
; GFX7LESS-NEXT: .LBB5_5:
@@ -3508,20 +3508,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3962,20 +3962,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4
; GFX7LESS-NEXT: .LBB6_5:
@@ -4334,20 +4334,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4761,19 +4761,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2
; GFX7LESS-NEXT: .LBB7_3:
@@ -5011,19 +5011,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX7LESS-DPP-NEXT: .LBB7_3:
@@ -5291,20 +5291,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: v_sub_f32_e32 v3, v4, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v4
+; GFX7LESS-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4
; GFX7LESS-NEXT: .LBB8_5:
@@ -5663,20 +5663,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v1, v2, v0
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_sub_f32_e32 v2, v3, v0
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v2
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8332,23 +8332,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2
; GFX7LESS-NEXT: .LBB11_3:
@@ -8589,23 +8589,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX7LESS-DPP-NEXT: .LBB11_3:
@@ -8878,23 +8878,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4
; GFX7LESS-NEXT: .LBB12_5:
@@ -9268,23 +9268,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9764,23 +9764,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s9
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-NEXT: .LBB13_3:
@@ -10021,23 +10021,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s9
; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-DPP-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX7LESS-DPP-NEXT: .LBB13_3:
@@ -10311,23 +10311,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4
; GFX7LESS-NEXT: .LBB14_5:
@@ -10701,23 +10701,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11226,23 +11226,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4
; GFX7LESS-NEXT: .LBB15_5:
@@ -11616,23 +11616,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
-; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
-; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
-; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
-; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1]
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1
; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index b5665835eaf7a..3d9fff23107b0 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -5863,12 +5863,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; NOOPT-NEXT: v_mov_b32_e32 v13, s13
; NOOPT-NEXT: v_mov_b32_e32 v14, s14
; NOOPT-NEXT: v_mov_b32_e32 v15, s15
-; NOOPT-NEXT: s_mov_b64 s[0:1], exec
-; NOOPT-NEXT: v_writelane_b32 v32, s0, 5
-; NOOPT-NEXT: v_writelane_b32 v32, s1, 6
-; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
-; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
-; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill
; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill
; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill
@@ -5885,6 +5879,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill
; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill
; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v32, s0, 5
+; NOOPT-NEXT: v_writelane_b32 v32, s1, 6
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
@@ -5903,19 +5903,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload
; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload
; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(6)
; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(5)
; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(4)
; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(3)
; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(2)
; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(1)
; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(0)
; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload
; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload
@@ -9009,27 +9002,26 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; GENERIC-LABEL: broken_phi_bb:
; GENERIC: ; %bb.0: ; %bb
; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT: s_mov_b32 s6, 8
+; GENERIC-NEXT: s_mov_b32 s4, 8
; GENERIC-NEXT: s_mov_b32 s3, 0xf000
; GENERIC-NEXT: s_mov_b32 s2, -1
; GENERIC-NEXT: s_branch .LBB26_2
; GENERIC-NEXT: .LBB26_1: ; %Flow
; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1
; GENERIC-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GENERIC-NEXT: s_mov_b32 s4, s1
; GENERIC-NEXT: s_cbranch_vccz .LBB26_4
; GENERIC-NEXT: .LBB26_2: ; %bb2
; GENERIC-NEXT: ; =>This Inner Loop Header: Depth=1
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT: s_cmp_ge_i32 s6, s0
+; GENERIC-NEXT: s_cmp_ge_i32 s4, s0
; GENERIC-NEXT: s_mov_b64 s[4:5], -1
-; GENERIC-NEXT: ; implicit-def: $sgpr6
; GENERIC-NEXT: s_cbranch_scc1 .LBB26_1
; GENERIC-NEXT: ; %bb.3: ; %bb4
; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1
; GENERIC-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; GENERIC-NEXT: s_waitcnt vmcnt(0)
; GENERIC-NEXT: s_mov_b64 s[4:5], 0
-; GENERIC-NEXT: s_mov_b32 s6, s1
; GENERIC-NEXT: s_branch .LBB26_1
; GENERIC-NEXT: .LBB26_4: ; %bb8
; GENERIC-NEXT: s_endpgm
@@ -9070,8 +9062,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, s2
; NOOPT-NEXT: v_mov_b32_e32 v0, s4
-; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3]
; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3]
; NOOPT-NEXT: v_writelane_b32 v18, s0, 2
; NOOPT-NEXT: v_writelane_b32 v18, s1, 3
; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
@@ -9117,30 +9109,30 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; NOOPT-NEXT: v_mov_b32_e32 v13, s17
; NOOPT-NEXT: v_mov_b32_e32 v14, s18
; NOOPT-NEXT: v_mov_b32_e32 v15, s19
-; NOOPT-NEXT: v_mov_b32_e32 v16, s0
-; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: v_mov_b32_e32 v0, s0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
; NOOPT-NEXT: s_mov_b64 s[0:1], exec
; NOOPT-NEXT: v_writelane_b32 v18, s0, 4
; NOOPT-NEXT: v_writelane_b32 v18, s1, 5
; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
; NOOPT-NEXT: buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill
; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
-; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
; NOOPT-NEXT: .LBB26_3: ; Parent Loop BB26_1 Depth=1
; NOOPT-NEXT: ; => This Inner Loop Header: Depth=2
@@ -9151,30 +9143,23 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: v_readlane_b32 s0, v18, 6
; NOOPT-NEXT: v_readlane_b32 s1, v18, 7
-; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(6)
-; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(5)
-; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(4)
-; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(3)
-; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(2)
-; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(1)
-; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_waitcnt expcnt(0)
-; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
-; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload
; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
@@ -9198,22 +9183,22 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill
; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill
; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
-; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill
; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
; NOOPT-NEXT: v_writelane_b32 v18, s2, 6
; NOOPT-NEXT: v_writelane_b32 v18, s3, 7
@@ -9251,9 +9236,9 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload
; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload
; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload
-; NOOPT-NEXT: s_mov_b64 s[0:1], 0
; NOOPT-NEXT: s_waitcnt vmcnt(14)
; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], 0
; NOOPT-NEXT: v_writelane_b32 v18, s0, 2
; NOOPT-NEXT: v_writelane_b32 v18, s1, 3
; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
index fab5d386446d3..044fb2abd5cc1 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
@@ -28,29 +28,29 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1
; GCN-NEXT: ; %bb.1: ; %atomic
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400
+; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:400
; GCN-NEXT: s_load_dword s4, s[4:5], 0xf
; GCN-NEXT: s_mov_b64 s[2:3], 0
; GCN-NEXT: .LBB0_2: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_max_i32_e32 v3, s4, v4
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v5, v3
-; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_max_i32_e32 v4, s4, v5
+; GCN-NEXT: v_mov_b32_e32 v3, v4
+; GCN-NEXT: v_mov_b32_e32 v4, v5
+; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v4, v5
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN-NEXT: s_cbranch_execnz .LBB0_2
; GCN-NEXT: ; %bb.3: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GCN-NEXT: .LBB0_4: ; %exit
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -87,22 +87,22 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs
; GCN-NEXT: ; %bb.1: ; %atomic
; GCN-NEXT: s_mov_b32 s0, s2
; GCN-NEXT: s_mov_b32 s1, s2
-; GCN-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:400
+; GCN-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 offset:400
; GCN-NEXT: s_load_dword s6, s[4:5], 0xf
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: .LBB1_2: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_max_i32_e32 v3, s6, v4
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v5, v3
-; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[0:3], 0 addr64 offset:400 glc
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_max_i32_e32 v4, s6, v5
+; GCN-NEXT: v_mov_b32_e32 v3, v4
+; GCN-NEXT: v_mov_b32_e32 v4, v5
+; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[0:3], 0 addr64 offset:400 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v4, v5
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz .LBB1_2
; GCN-NEXT: .LBB1_3: ; %exit
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b5e7589cbd134..b1e6f47fdf2c7 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2556,32 +2556,32 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X,
-; EG-NEXT: MOV * T1.W, literal.x,
+; EG-NEXT: OR_INT T1.W, KC0[2].W, KC0[3].X,
+; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
-; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0,
+; EG-NEXT: SETNE_INT * T1.W, PV.W, 0.0,
; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; EG-NEXT: ALU clause starting at 19:
-; EG-NEXT: MOV T0.W, KC0[2].W,
-; EG-NEXT: MOV * T1.W, KC0[3].Z,
+; EG-NEXT: MOV T1.W, KC0[2].W,
+; EG-NEXT: MOV * T0.W, KC0[3].Z,
; EG-NEXT: MOV T2.W, KC0[3].Y,
; EG-NEXT: MULLO_INT * T0.X, PV.W, PS,
-; EG-NEXT: MOV T1.W, KC0[3].X,
-; EG-NEXT: MULHI * T0.Y, T0.W, PV.W,
+; EG-NEXT: MOV T0.W, KC0[3].X,
+; EG-NEXT: MULHI * T0.Y, T1.W, PV.W,
; EG-NEXT: ADD_INT T3.W, PS, T0.X,
; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W,
; EG-NEXT: ADD_INT T0.Y, PV.W, PS,
-; EG-NEXT: MOV T1.W, literal.x,
-; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W,
+; EG-NEXT: MOV T0.W, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, T1.W, T2.W,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 31:
-; EG-NEXT: MOV T0.W, KC0[2].Y,
-; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0,
+; EG-NEXT: MOV T1.W, KC0[2].Y,
+; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0,
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
; EG-NEXT: ALU clause starting at 34:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 35:
-; EG-NEXT: LSHR * T1.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T1.X, T1.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = icmp eq i64 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6512bee36e88b..d29a7a2dc5656 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -240,27 +240,31 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
@@ -270,34 +274,34 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12
-; GFX9-O0-NEXT: v_ashrrev_i64 v[2:3], s4, v[2:3]
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4]
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v12
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1
+; GFX9-O0-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 0
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 1
; GFX9-O0-NEXT: s_mov_b32 s10, s6
-; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2
+; GFX9-O0-NEXT: v_writelane_b32 v29, s10, 2
; GFX9-O0-NEXT: s_mov_b32 s11, s7
-; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v2
+; GFX9-O0-NEXT: v_writelane_b32 v29, s11, 3
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v3, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v1, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v0, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v2, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v1, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
@@ -305,25 +309,25 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[11:12], s[4:5]
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5]
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17
@@ -462,18 +466,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9]
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3
; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr16
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v1
; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10
; GFX9-O0-NEXT: ; implicit-def: $sgpr13
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12
@@ -552,33 +556,33 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4
-; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5
+; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 4
+; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
@@ -586,67 +590,64 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB0_8
; GFX9-O0-NEXT: .LBB0_1: ; %Flow
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 6
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 4
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_9
; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 1
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -671,408 +672,408 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_3
; GFX9-O0-NEXT: .LBB0_5: ; %Flow1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 8
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 9
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10
-; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_readlane_b32 s6, v29, 10
+; GFX9-O0-NEXT: v_readlane_b32 s7, v29, 11
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29
+; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7
; GFX9-O0-NEXT: s_mov_b32 s5, 1
-; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22
-; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3]
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[26:27]
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[2:3]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v27
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v26
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v14
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1]
-; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[10:11]
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27
-; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v25
+; GFX9-O0-NEXT: v_or3_b32 v10, v10, v11, v28
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26
-; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v24
+; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v11
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25
-; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17
+; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v10
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v16, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v14, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v11, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1
; GFX9-O0-NEXT: s_mov_b32 s8, s5
-; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
+; GFX9-O0-NEXT: v_and_b32_e64 v4, v15, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4
-; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-O0-NEXT: v_and_b32_e64 v6, v17, s4
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22
-; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20
+; GFX9-O0-NEXT: v_and_b32_e64 v22, v15, v22
+; GFX9-O0-NEXT: v_and_b32_e64 v20, v17, v20
; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22
-; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18
-; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
+; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, v22
+; GFX9-O0-NEXT: v_and_b32_e64 v17, v17, v18
+; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17
+; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v21
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v19
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v17, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v15, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1
; GFX9-O0-NEXT: s_mov_b32 s5, s8
; GFX9-O0-NEXT: s_mov_b32 s4, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-O0-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20
-; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19
-; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13]
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v8
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 6
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 7
+; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 10
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 11
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-O0-NEXT: s_branch .LBB0_1
; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v8, v[18:19]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15
; GFX9-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[16:17]
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20
-; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v4, s4
-; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s4
-; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7]
+; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v22
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v20
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v15
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, s4
+; GFX9-O0-NEXT: v_sub_u32_e64 v11, v8, s4
+; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v11, v[16:17]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[6:7]
; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v20
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15]
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], v8, v[16:17]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[6:7]
+; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s8, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s8
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v16, v11, v16, s[6:7]
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13
+; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v10
; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1
; GFX9-O0-NEXT: s_mov_b32 s7, s8
; GFX9-O0-NEXT: s_mov_b32 s6, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7
-; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s7
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s7
+; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v11, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13
+; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10
-; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 10
+; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 11
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_6
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
@@ -1085,118 +1086,118 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s6
; GFX9-O0-NEXT: s_mov_b32 s9, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4
+; GFX9-O0-NEXT: v_add_co_u32_e32 v7, vcc, v3, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11]
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
+; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], v2, v[11:12]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: s_mov_b32 s4, 64
; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2
-; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[9:10]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v15
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4
; GFX9-O0-NEXT: s_mov_b32 s10, 63
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s10, 0
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[10:11]
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], v2, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9
-; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
-; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
+; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 8
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5
@@ -1212,10 +1213,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 32
@@ -1489,7 +1490,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4]
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -1703,7 +1704,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
@@ -1780,16 +1781,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1
+; GFX9-O0-NEXT: ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 0
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 1
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v1, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT: v_or_b32_e64 v8, v0, v2
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
@@ -1834,18 +1835,18 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v3
; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
-; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3
+; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v1
; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10
; GFX9-O0-NEXT: ; implicit-def: $sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
@@ -1928,33 +1929,33 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2
-; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3
+; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 2
+; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
@@ -1962,50 +1963,47 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB1_8
; GFX9-O0-NEXT: .LBB1_1: ; %Flow
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 4
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB1_5
; GFX9-O0-NEXT: .LBB1_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 2
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 3
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2047,29 +2045,29 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB1_3
; GFX9-O0-NEXT: .LBB1_5: ; %Flow1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6
-; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7
+; GFX9-O0-NEXT: v_readlane_b32 s4, v29, 6
+; GFX9-O0-NEXT: v_readlane_b32 s5, v29, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2087,214 +2085,214 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8
-; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_readlane_b32 s6, v29, 8
+; GFX9-O0-NEXT: v_readlane_b32 s7, v29, 9
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v29
+; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[2:3]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7
; GFX9-O0-NEXT: s_mov_b32 s5, 1
-; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s5, v[22:23]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v28
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22
-; GFX9-O0-NEXT: v_or_b32_e64 v22, v5, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[2:3]
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s4, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v29
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[26:27]
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v27
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v26
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[2:3]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v27
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v28
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v26
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v14
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s5, v[0:1]
-; GFX9-O0-NEXT: v_lshlrev_b64 v[28:29], s5, v[6:7]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[26:27], s5, v[10:11]
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v29
-; GFX9-O0-NEXT: s_waitcnt vmcnt(10)
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v27
-; GFX9-O0-NEXT: v_or3_b32 v6, v6, v7, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v25
+; GFX9-O0-NEXT: v_or3_b32 v10, v10, v11, v28
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v28
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v26
-; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v24
+; GFX9-O0-NEXT: v_or3_b32 v0, v0, v1, v11
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT: s_waitcnt vmcnt(8)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v25
-; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17
+; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v24
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16
; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v22
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, v7, v10
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v16, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v14, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v11, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s4, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 1
; GFX9-O0-NEXT: s_mov_b32 s8, s5
-; GFX9-O0-NEXT: v_and_b32_e64 v12, v7, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
+; GFX9-O0-NEXT: v_and_b32_e64 v4, v15, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT: v_and_b32_e64 v14, v11, s4
-; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-O0-NEXT: v_and_b32_e64 v6, v17, s4
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT: v_and_b32_e64 v22, v7, v22
-; GFX9-O0-NEXT: v_and_b32_e64 v20, v11, v20
+; GFX9-O0-NEXT: v_and_b32_e64 v22, v15, v22
+; GFX9-O0-NEXT: v_and_b32_e64 v20, v17, v20
; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v21, v22
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, v22
-; GFX9-O0-NEXT: v_and_b32_e64 v22, v11, v18
-; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v22
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v23
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v21
-; GFX9-O0-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v19
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
+; GFX9-O0-NEXT: v_and_b32_e64 v15, v15, v22
+; GFX9-O0-NEXT: v_and_b32_e64 v17, v17, v18
+; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17
+; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v20
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v21
+; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, v10, v19
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v18, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v17, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v15, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1
; GFX9-O0-NEXT: s_mov_b32 s5, s8
; GFX9-O0-NEXT: s_mov_b32 s4, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v19, vcc, v11, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-O0-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v19
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v20
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v20
-; GFX9-O0-NEXT: v_or_b32_e64 v18, v18, v21
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19
-; GFX9-O0-NEXT: v_or_b32_e64 v16, v16, v17
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v18
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13]
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v8
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 4
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 5
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 4
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 5
+; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 8
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 9
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
@@ -2312,128 +2310,128 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB1_1
; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], v4, v[18:19]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], v8, v[18:19]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15
; GFX9-O0-NEXT: s_mov_b32 s4, 64
-; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v4
-; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX9-O0-NEXT: v_sub_u32_e64 v20, s4, v8
+; GFX9-O0-NEXT: v_lshlrev_b64 v[20:21], v20, v[16:17]
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v22
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20
-; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v7
-; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v4, s4
-; GFX9-O0-NEXT: v_sub_u32_e64 v5, v4, s4
-; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v5, v[14:15]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v21
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[6:7]
+; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v22
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v20
+; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v15
+; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, s4
+; GFX9-O0-NEXT: v_sub_u32_e64 v11, v8, s4
+; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], v11, v[16:17]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v21
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[6:7]
; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, s4
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v22, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v20
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v11, v22, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v20
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15]
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], v8, v[16:17]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v17
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[6:7]
+; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s8, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s8
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v16, v11, v16, s[6:7]
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13
+; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v10
; GFX9-O0-NEXT: s_mov_b64 s[8:9], -1
; GFX9-O0-NEXT: s_mov_b32 s7, s8
; GFX9-O0-NEXT: s_mov_b32 s6, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s7
-; GFX9-O0-NEXT: v_add_co_u32_e32 v16, vcc, v15, v16
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s7
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s7
+; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v12, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s7
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v11, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13
+; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8
-; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_writelane_b32 v29, s4, 8
+; GFX9-O0-NEXT: v_writelane_b32 v29, s5, 9
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
@@ -2443,12 +2441,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_branch .LBB1_6
; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
@@ -2461,118 +2459,118 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
; GFX9-O0-NEXT: s_mov_b32 s8, s6
; GFX9-O0-NEXT: s_mov_b32 s9, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-O0-NEXT: v_add_co_u32_e32 v8, vcc, v3, v4
+; GFX9-O0-NEXT: v_add_co_u32_e32 v7, vcc, v3, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v2, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v8
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11]
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
+; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], v2, v[11:12]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: s_mov_b32 s4, 64
; GFX9-O0-NEXT: v_sub_u32_e64 v13, s4, v2
-; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[6:7]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], v13, v[9:10]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v15
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13
-; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v15
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v2, s4
; GFX9-O0-NEXT: s_mov_b32 s10, 63
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s10, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[12:13], v3, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5]
+; GFX9-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[4:5]
; GFX9-O0-NEXT: s_mov_b32 s10, 0
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v2, s10
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[10:11]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[10:11]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v13
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[10:11]
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; implicit-def: $sgpr10
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], v2, v[9:10]
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9
-; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
-; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1
-; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
+; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s7
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6
-; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7
+; GFX9-O0-NEXT: v_writelane_b32 v29, s6, 6
+; GFX9-O0-NEXT: v_writelane_b32 v29, s7, 7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5
@@ -2829,7 +2827,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4]
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index df496258a2509..0981584598abd 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -369,12 +369,12 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v18, v2, v3
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v6
; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v7
-; GCN-IR-NEXT: v_min_u32_e32 v11, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v10, v11
+; GCN-IR-NEXT: v_min_u32_e32 v19, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v18, v19
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7]
; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7]
@@ -399,47 +399,47 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v4, v10
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[6:7], v8
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v4, v11
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], -1, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_not_b32_e32 v4, v18
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[6:7], v8
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v4, v19
+; GCN-IR-NEXT: v_addc_u32_e64 v7, s[8:9], -1, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v18, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v18
+; GCN-IR-NEXT: v_and_b32_e32 v19, v18, v1
+; GCN-IR-NEXT: v_and_b32_e32 v18, v18, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v18
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v19, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB1_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0
@@ -1420,9 +1420,9 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1444,46 +1444,46 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB11_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v8
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v16, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v16
+; GCN-IR-NEXT: v_and_b32_e32 v17, v16, v1
+; GCN-IR-NEXT: v_and_b32_e32 v16, v16, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v16
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v17, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB11_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB11_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0
@@ -1613,9 +1613,9 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1631,54 +1631,54 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB12_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[8:9], v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v8
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v16, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v16
+; GCN-IR-NEXT: v_and_b32_e32 v17, v16, v1
+; GCN-IR-NEXT: v_and_b32_e32 v16, v16, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v16
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v17, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB12_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0
@@ -1715,8 +1715,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v4
; GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0
; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v5
-; GCN-IR-NEXT: v_min_u32_e32 v8, v0, v1
-; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 48, v8
+; GCN-IR-NEXT: v_min_u32_e32 v12, v0, v1
+; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 48, v12
; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[0:1]
@@ -1738,44 +1738,44 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], v0
; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB13_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[4:5], v6
-; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[4:5], v6
+; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v12
+; GCN-IR-NEXT: v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v2, 31, v1
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v2
-; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s12, v6
-; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v2
+; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s10, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v4
-; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2
; GCN-IR-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
+; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v2
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v3
-; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v2
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v12, 0x8000, v12
+; GCN-IR-NEXT: v_or_b32_e32 v1, v7, v1
+; GCN-IR-NEXT: v_or_b32_e32 v0, v6, v0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v3
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v6, v2
+; GCN-IR-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB13_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB13_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; GCN-IR-NEXT: v_or_b32_e32 v3, v3, v1
; GCN-IR-NEXT: v_or_b32_e32 v2, v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 47dfa9f4fc2d3..a5cb9a507bd89 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -346,12 +346,12 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5
+; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13
+; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v10, v11
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7]
@@ -375,47 +375,47 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v6, v12
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT: v_not_b32_e32 v6, v10
+; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v8
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v11
+; GCN-IR-NEXT: v_addc_u32_e64 v9, s[8:9], -1, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v12
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v13, vcc
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
-; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v18, 31, v6
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
-; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v18
+; GCN-IR-NEXT: v_and_b32_e32 v19, v18, v3
+; GCN-IR-NEXT: v_and_b32_e32 v18, v18, v2
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v18
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v19, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB1_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
@@ -1538,9 +1538,9 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1561,46 +1561,46 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB11_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v8
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1
+; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB11_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB11_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -1729,9 +1729,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1746,54 +1746,54 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB12_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[8:9], v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v8
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1
+; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB12_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -1836,8 +1836,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v8
; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3]
@@ -1859,44 +1859,44 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB13_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v6
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v8
+; GCN-IR-NEXT: v_addc_u32_e64 v7, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v14, 0x8000, v14
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB13_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB13_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index e9017939f8a4a..f1d7c84836ca7 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -341,47 +341,47 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v10
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc
; GCN-IR-NEXT: v_not_b32_e32 v0, v14
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], -1, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_addc_u32_e64 v1, s[8:9], -1, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3
+; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2
+; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5
+; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v7
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB1_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1
; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0
@@ -1065,8 +1065,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffd0, v10
+; GCN-IR-NEXT: v_min_u32_e32 v14, v2, v3
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffd0, v14
; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7]
@@ -1081,54 +1081,54 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB9_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v14
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[8:9], v8
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1
+; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB9_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB9_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
@@ -1177,44 +1177,44 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB10_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v8
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: v_addc_u32_e64 v1, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v6
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
-; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
+; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v10
+; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
+; GCN-IR-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB10_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB10_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
@@ -1377,43 +1377,43 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB12_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v8
+; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: v_addc_u32_e64 v1, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v6
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v8
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8
-; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8
-; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT: v_and_b32_e32 v10, 24, v10
+; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v6, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v8, v10
+; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
+; GCN-IR-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB12_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB12_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 6480a88d40f5a..bd742968ba37b 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -322,12 +322,12 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5
+; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5
; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0
; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4
; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5
-; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13
+; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5
+; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v10, v11
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7]
@@ -350,47 +350,47 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v6, v12
-; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v8
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], -1, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT: v_not_b32_e32 v6, v10
+; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v8
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v11
+; GCN-IR-NEXT: v_addc_u32_e64 v9, s[8:9], -1, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10
-; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v12
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v13, vcc
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8
-; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
-; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3
-; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v16, 31, v6
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12
-; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v16
+; GCN-IR-NEXT: v_and_b32_e32 v17, v16, v3
+; GCN-IR-NEXT: v_and_b32_e32 v16, v16, v2
+; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v12, vcc, v12, v16
+; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
+; GCN-IR-NEXT: v_subb_u32_e32 v13, vcc, v13, v17, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB1_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB1_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
@@ -1166,8 +1166,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v10
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v8
; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1182,54 +1182,54 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
; GCN-IR-NEXT: s_cbranch_execz .LBB8_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000
; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB8_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0
; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v6
-; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[8:9], v6
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v8
+; GCN-IR-NEXT: v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1
-; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v1
+; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v0
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v14
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v11, v15, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB8_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB8_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -1262,8 +1262,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0
; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10
+; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v8
; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3]
@@ -1284,44 +1284,44 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB9_5
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
-; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff
+; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v6
+; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v8
+; GCN-IR-NEXT: v_addc_u32_e64 v7, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: s_movk_i32 s10, 0x7fff
; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s12, v8
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, s10, v10
+; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v11, vcc
; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6
-; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4
; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10
-; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v12, 0x8000, v12
+; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT: v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v10, v12
+; GCN-IR-NEXT: v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3
; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-IR-NEXT: .LBB9_5: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: .LBB9_5: ; %Flow4
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989a2507..310f3ad04917b 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -508,12 +508,12 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v16, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v16
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v16
; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v16
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v16
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
-; GFX942-NEXT: ; implicit-def: $vgpr2
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
+; GFX942-NEXT: ; implicit-def: $vgpr0
; GFX942-NEXT: ; implicit-def: $vgpr12
; GFX942-NEXT: ; implicit-def: $vgpr10
; GFX942-NEXT: ; implicit-def: $vgpr13
@@ -521,74 +521,75 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: ; implicit-def: $vgpr11
; GFX942-NEXT: ; implicit-def: $vgpr15
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v0
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 8, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v2
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB10_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v3, s[10:11]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v1, s[10:11]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v16
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX942-NEXT: v_mov_b32_e32 v4, 8
; GFX942-NEXT: v_mov_b32_e32 v5, 7
; GFX942-NEXT: v_mov_b32_e32 v6, 6
-; GFX942-NEXT: v_mov_b32_e32 v1, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 5
; GFX942-NEXT: v_mov_b32_e32 v7, 4
; GFX942-NEXT: v_mov_b32_e32 v8, 3
; GFX942-NEXT: v_mov_b32_e32 v9, 2
-; GFX942-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v1
+; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v1
+; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v0
+; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v0
; GFX942-NEXT: .LBB10_2: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB10_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v9
-; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7
-; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v9
+; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v7
+; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v4
-; GFX942-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX942-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v6
+; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX942-NEXT: v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[12:13]
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v12, v9
; GFX942-NEXT: v_mov_b32_e32 v10, v8
; GFX942-NEXT: v_mov_b32_e32 v13, v7
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
; GFX942-NEXT: v_mov_b32_e32 v14, v6
; GFX942-NEXT: v_mov_b32_e32 v11, v5
; GFX942-NEXT: v_mov_b32_e32 v15, v4
; GFX942-NEXT: .LBB10_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v12
-; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v13
-; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v15
-; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14
-; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v12
+; GFX942-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v13
+; GFX942-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v14
+; GFX942-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v15
+; GFX942-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4212fd3b35cd8..b7c12854d1115 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1292,23 +1292,23 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX1064-NEXT: s_mov_b64 vcc, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11]
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB22_2
; GFX1064-NEXT: ; %bb.1: ; %bb
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
+; GFX1064-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_b64 vcc, vcc, exec
; GFX1064-NEXT: .LBB22_2: ; %exit
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3
diff --git a/llvm/test/CodeGen/ARM/and-cmp0-sink.ll b/llvm/test/CodeGen/ARM/and-cmp0-sink.ll
index fb9139c0d1285..389193f9d2407 100644
--- a/llvm/test/CodeGen/ARM/and-cmp0-sink.ll
+++ b/llvm/test/CodeGen/ARM/and-cmp0-sink.ll
@@ -112,20 +112,20 @@ define void @f(i32 %v, ptr noalias %outp) {
; V6M-NEXT: push {r4, lr}
; V6M-NEXT: movs r2, #0
; V6M-NEXT: str r2, [r1]
-; V6M-NEXT: movs r3, #14
-; V6M-NEXT: ands r3, r0
+; V6M-NEXT: movs r2, #14
+; V6M-NEXT: ands r2, r0
; V6M-NEXT: movs r4, #4
; V6M-NEXT: ands r4, r0
-; V6M-NEXT: movs r2, #2
-; V6M-NEXT: ands r2, r0
+; V6M-NEXT: movs r3, #2
+; V6M-NEXT: ands r3, r0
; V6M-NEXT: lsls r0, r0, #31
; V6M-NEXT: bne .LBB0_5
; V6M-NEXT: @ %bb.1: @ %if.then
; V6M-NEXT: movs r0, #129
-; V6M-NEXT: cmp r2, #0
+; V6M-NEXT: cmp r3, #0
; V6M-NEXT: beq .LBB0_3
; V6M-NEXT: @ %bb.2:
-; V6M-NEXT: lsls r2, r0, #8
+; V6M-NEXT: lsls r3, r0, #8
; V6M-NEXT: .LBB0_3: @ %if.then
; V6M-NEXT: cmp r4, #0
; V6M-NEXT: beq .LBB0_10
@@ -134,22 +134,22 @@ define void @f(i32 %v, ptr noalias %outp) {
; V6M-NEXT: b .LBB0_9
; V6M-NEXT: .LBB0_5: @ %if.else
; V6M-NEXT: movs r0, #129
-; V6M-NEXT: cmp r2, #0
+; V6M-NEXT: cmp r3, #0
; V6M-NEXT: beq .LBB0_7
; V6M-NEXT: @ %bb.6:
-; V6M-NEXT: lsls r2, r0, #6
+; V6M-NEXT: lsls r3, r0, #6
; V6M-NEXT: .LBB0_7: @ %if.else
; V6M-NEXT: cmp r4, #0
; V6M-NEXT: beq .LBB0_10
; V6M-NEXT: @ %bb.8: @ %if.else
; V6M-NEXT: lsls r0, r0, #5
; V6M-NEXT: .LBB0_9: @ %if.else
-; V6M-NEXT: adds r2, r2, r0
+; V6M-NEXT: adds r3, r3, r0
; V6M-NEXT: .LBB0_10: @ %if.else
-; V6M-NEXT: cmp r3, #0
+; V6M-NEXT: cmp r2, #0
; V6M-NEXT: beq .LBB0_12
; V6M-NEXT: @ %bb.11: @ %if.end
-; V6M-NEXT: str r2, [r1]
+; V6M-NEXT: str r3, [r1]
; V6M-NEXT: .LBB0_12: @ %exit
; V6M-NEXT: pop {r4, pc}
entry:
diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll
index 1146ad64ee709..e1bf4837d6a47 100644
--- a/llvm/test/CodeGen/ARM/cttz.ll
+++ b/llvm/test/CodeGen/ARM/cttz.ll
@@ -229,24 +229,24 @@ define i64 @test_i64(i64 %a) {
; CHECK-6M-NEXT: orrs r0, r3
; CHECK-6M-NEXT: beq .LBB3_6
; CHECK-6M-NEXT: @ %bb.1: @ %cond.false
-; CHECK-6M-NEXT: ldr r6, .LCPI3_0
+; CHECK-6M-NEXT: ldr r5, .LCPI3_0
; CHECK-6M-NEXT: adr r4, .LCPI3_1
; CHECK-6M-NEXT: movs r0, #32
; CHECK-6M-NEXT: cmp r3, #0
-; CHECK-6M-NEXT: mov r5, r0
+; CHECK-6M-NEXT: mov r6, r0
; CHECK-6M-NEXT: beq .LBB3_3
; CHECK-6M-NEXT: @ %bb.2: @ %cond.false
-; CHECK-6M-NEXT: rsbs r5, r3, #0
-; CHECK-6M-NEXT: ands r5, r3
-; CHECK-6M-NEXT: muls r5, r6, r5
-; CHECK-6M-NEXT: lsrs r3, r5, #27
-; CHECK-6M-NEXT: ldrb r5, [r4, r3]
+; CHECK-6M-NEXT: rsbs r6, r3, #0
+; CHECK-6M-NEXT: ands r6, r3
+; CHECK-6M-NEXT: muls r6, r5, r6
+; CHECK-6M-NEXT: lsrs r3, r6, #27
+; CHECK-6M-NEXT: ldrb r6, [r4, r3]
; CHECK-6M-NEXT: .LBB3_3: @ %cond.false
-; CHECK-6M-NEXT: adds r5, #32
+; CHECK-6M-NEXT: adds r6, #32
; CHECK-6M-NEXT: rsbs r3, r2, #0
; CHECK-6M-NEXT: ands r3, r2
-; CHECK-6M-NEXT: muls r6, r3, r6
-; CHECK-6M-NEXT: lsrs r3, r6, #27
+; CHECK-6M-NEXT: muls r5, r3, r5
+; CHECK-6M-NEXT: lsrs r3, r5, #27
; CHECK-6M-NEXT: cmp r2, #0
; CHECK-6M-NEXT: bne .LBB3_7
; CHECK-6M-NEXT: @ %bb.4: @ %cond.false
@@ -260,7 +260,7 @@ define i64 @test_i64(i64 %a) {
; CHECK-6M-NEXT: ldrb r0, [r4, r3]
; CHECK-6M-NEXT: bne .LBB3_5
; CHECK-6M-NEXT: .LBB3_8: @ %cond.false
-; CHECK-6M-NEXT: mov r0, r5
+; CHECK-6M-NEXT: mov r0, r6
; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
; CHECK-6M-NEXT: .p2align 2
; CHECK-6M-NEXT: @ %bb.9:
@@ -279,24 +279,24 @@ define i64 @test_i64(i64 %a) {
; CHECK-8MBASE-NEXT: orrs r0, r3
; CHECK-8MBASE-NEXT: beq .LBB3_6
; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false
-; CHECK-8MBASE-NEXT: movw r6, #46385
-; CHECK-8MBASE-NEXT: movt r6, #1916
+; CHECK-8MBASE-NEXT: movw r5, #46385
+; CHECK-8MBASE-NEXT: movt r5, #1916
; CHECK-8MBASE-NEXT: adr r4, .LCPI3_0
; CHECK-8MBASE-NEXT: movs r0, #32
-; CHECK-8MBASE-NEXT: mov r5, r0
+; CHECK-8MBASE-NEXT: mov r6, r0
; CHECK-8MBASE-NEXT: cbz r3, .LBB3_3
; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false
-; CHECK-8MBASE-NEXT: rsbs r5, r3, #0
-; CHECK-8MBASE-NEXT: ands r5, r3
-; CHECK-8MBASE-NEXT: muls r5, r6, r5
-; CHECK-8MBASE-NEXT: lsrs r3, r5, #27
-; CHECK-8MBASE-NEXT: ldrb r5, [r4, r3]
+; CHECK-8MBASE-NEXT: rsbs r6, r3, #0
+; CHECK-8MBASE-NEXT: ands r6, r3
+; CHECK-8MBASE-NEXT: muls r6, r5, r6
+; CHECK-8MBASE-NEXT: lsrs r3, r6, #27
+; CHECK-8MBASE-NEXT: ldrb r6, [r4, r3]
; CHECK-8MBASE-NEXT: .LBB3_3: @ %cond.false
-; CHECK-8MBASE-NEXT: adds r5, #32
+; CHECK-8MBASE-NEXT: adds r6, #32
; CHECK-8MBASE-NEXT: rsbs r3, r2, #0
; CHECK-8MBASE-NEXT: ands r3, r2
-; CHECK-8MBASE-NEXT: muls r6, r3, r6
-; CHECK-8MBASE-NEXT: lsrs r3, r6, #27
+; CHECK-8MBASE-NEXT: muls r5, r3, r5
+; CHECK-8MBASE-NEXT: lsrs r3, r5, #27
; CHECK-8MBASE-NEXT: cmp r2, #0
; CHECK-8MBASE-NEXT: bne .LBB3_7
; CHECK-8MBASE-NEXT: @ %bb.4: @ %cond.false
@@ -310,7 +310,7 @@ define i64 @test_i64(i64 %a) {
; CHECK-8MBASE-NEXT: ldrb r0, [r4, r3]
; CHECK-8MBASE-NEXT: bne .LBB3_5
; CHECK-8MBASE-NEXT: .LBB3_8: @ %cond.false
-; CHECK-8MBASE-NEXT: mov r0, r5
+; CHECK-8MBASE-NEXT: mov r0, r6
; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
; CHECK-8MBASE-NEXT: .p2align 2
; CHECK-8MBASE-NEXT: @ %bb.9:
@@ -503,24 +503,24 @@ define i64 @test_i64_zero_undef(i64 %a) {
; CHECK-6M-NEXT: .save {r4, r5, r7, lr}
; CHECK-6M-NEXT: push {r4, r5, r7, lr}
; CHECK-6M-NEXT: mov r2, r0
-; CHECK-6M-NEXT: ldr r5, .LCPI7_0
+; CHECK-6M-NEXT: ldr r4, .LCPI7_0
; CHECK-6M-NEXT: adr r3, .LCPI7_1
; CHECK-6M-NEXT: movs r0, #32
; CHECK-6M-NEXT: cmp r1, #0
-; CHECK-6M-NEXT: mov r4, r0
+; CHECK-6M-NEXT: mov r5, r0
; CHECK-6M-NEXT: beq .LBB7_2
; CHECK-6M-NEXT: @ %bb.1:
-; CHECK-6M-NEXT: rsbs r4, r1, #0
-; CHECK-6M-NEXT: ands r4, r1
-; CHECK-6M-NEXT: muls r4, r5, r4
-; CHECK-6M-NEXT: lsrs r1, r4, #27
-; CHECK-6M-NEXT: ldrb r4, [r3, r1]
+; CHECK-6M-NEXT: rsbs r5, r1, #0
+; CHECK-6M-NEXT: ands r5, r1
+; CHECK-6M-NEXT: muls r5, r4, r5
+; CHECK-6M-NEXT: lsrs r1, r5, #27
+; CHECK-6M-NEXT: ldrb r5, [r3, r1]
; CHECK-6M-NEXT: .LBB7_2:
-; CHECK-6M-NEXT: adds r4, #32
+; CHECK-6M-NEXT: adds r5, #32
; CHECK-6M-NEXT: rsbs r1, r2, #0
; CHECK-6M-NEXT: ands r1, r2
-; CHECK-6M-NEXT: muls r5, r1, r5
-; CHECK-6M-NEXT: lsrs r1, r5, #27
+; CHECK-6M-NEXT: muls r4, r1, r4
+; CHECK-6M-NEXT: lsrs r1, r4, #27
; CHECK-6M-NEXT: cmp r2, #0
; CHECK-6M-NEXT: bne .LBB7_5
; CHECK-6M-NEXT: @ %bb.3:
@@ -532,7 +532,7 @@ define i64 @test_i64_zero_undef(i64 %a) {
; CHECK-6M-NEXT: ldrb r0, [r3, r1]
; CHECK-6M-NEXT: bne .LBB7_4
; CHECK-6M-NEXT: .LBB7_6:
-; CHECK-6M-NEXT: mov r0, r4
+; CHECK-6M-NEXT: mov r0, r5
; CHECK-6M-NEXT: movs r1, #0
; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
; CHECK-6M-NEXT: .p2align 2
@@ -547,24 +547,24 @@ define i64 @test_i64_zero_undef(i64 %a) {
; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr}
; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr}
; CHECK-8MBASE-NEXT: mov r2, r0
-; CHECK-8MBASE-NEXT: movw r5, #46385
-; CHECK-8MBASE-NEXT: movt r5, #1916
+; CHECK-8MBASE-NEXT: movw r4, #46385
+; CHECK-8MBASE-NEXT: movt r4, #1916
; CHECK-8MBASE-NEXT: adr r3, .LCPI7_0
; CHECK-8MBASE-NEXT: movs r0, #32
-; CHECK-8MBASE-NEXT: mov r4, r0
+; CHECK-8MBASE-NEXT: mov r5, r0
; CHECK-8MBASE-NEXT: cbz r1, .LBB7_2
; CHECK-8MBASE-NEXT: @ %bb.1:
-; CHECK-8MBASE-NEXT: rsbs r4, r1, #0
-; CHECK-8MBASE-NEXT: ands r4, r1
-; CHECK-8MBASE-NEXT: muls r4, r5, r4
-; CHECK-8MBASE-NEXT: lsrs r1, r4, #27
-; CHECK-8MBASE-NEXT: ldrb r4, [r3, r1]
+; CHECK-8MBASE-NEXT: rsbs r5, r1, #0
+; CHECK-8MBASE-NEXT: ands r5, r1
+; CHECK-8MBASE-NEXT: muls r5, r4, r5
+; CHECK-8MBASE-NEXT: lsrs r1, r5, #27
+; CHECK-8MBASE-NEXT: ldrb r5, [r3, r1]
; CHECK-8MBASE-NEXT: .LBB7_2:
-; CHECK-8MBASE-NEXT: adds r4, #32
+; CHECK-8MBASE-NEXT: adds r5, #32
; CHECK-8MBASE-NEXT: rsbs r1, r2, #0
; CHECK-8MBASE-NEXT: ands r1, r2
-; CHECK-8MBASE-NEXT: muls r5, r1, r5
-; CHECK-8MBASE-NEXT: lsrs r1, r5, #27
+; CHECK-8MBASE-NEXT: muls r4, r1, r4
+; CHECK-8MBASE-NEXT: lsrs r1, r4, #27
; CHECK-8MBASE-NEXT: cmp r2, #0
; CHECK-8MBASE-NEXT: bne .LBB7_5
; CHECK-8MBASE-NEXT: @ %bb.3:
@@ -576,7 +576,7 @@ define i64 @test_i64_zero_undef(i64 %a) {
; CHECK-8MBASE-NEXT: ldrb r0, [r3, r1]
; CHECK-8MBASE-NEXT: bne .LBB7_4
; CHECK-8MBASE-NEXT: .LBB7_6:
-; CHECK-8MBASE-NEXT: mov r0, r4
+; CHECK-8MBASE-NEXT: mov r0, r5
; CHECK-8MBASE-NEXT: movs r1, #0
; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
; CHECK-8MBASE-NEXT: .p2align 2
diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll
index 186276b50ceeb..2bef1c83d7969 100644
--- a/llvm/test/CodeGen/ARM/select-imm.ll
+++ b/llvm/test/CodeGen/ARM/select-imm.ll
@@ -455,13 +455,13 @@ define void @t9(ptr %a, i8 %b) {
; ARMT2-NEXT: cmp r0, r0
; ARMT2-NEXT: popne {r4, pc}
; ARMT2-NEXT: .LBB8_1: @ %while.body.preheader
-; ARMT2-NEXT: add r1, r4, #1
-; ARMT2-NEXT: mov r2, r0
+; ARMT2-NEXT: mov r1, r0
+; ARMT2-NEXT: add r2, r4, #1
; ARMT2-NEXT: .LBB8_2: @ %while.body
; ARMT2-NEXT: @ =>This Inner Loop Header: Depth=1
-; ARMT2-NEXT: add r2, r2, #1
; ARMT2-NEXT: add r1, r1, #1
-; ARMT2-NEXT: uxtb r3, r2
+; ARMT2-NEXT: add r2, r2, #1
+; ARMT2-NEXT: uxtb r3, r1
; ARMT2-NEXT: cmp r3, r0
; ARMT2-NEXT: blt .LBB8_2
; ARMT2-NEXT: @ %bb.3: @ %while.end
@@ -503,13 +503,13 @@ define void @t9(ptr %a, i8 %b) {
; THUMB2-NEXT: it ne
; THUMB2-NEXT: popne {r4, pc}
; THUMB2-NEXT: .LBB8_1: @ %while.body.preheader
-; THUMB2-NEXT: adds r1, r4, #1
-; THUMB2-NEXT: mov r2, r0
+; THUMB2-NEXT: mov r1, r0
+; THUMB2-NEXT: adds r2, r4, #1
; THUMB2-NEXT: .LBB8_2: @ %while.body
; THUMB2-NEXT: @ =>This Inner Loop Header: Depth=1
-; THUMB2-NEXT: adds r2, #1
; THUMB2-NEXT: adds r1, #1
-; THUMB2-NEXT: uxtb r3, r2
+; THUMB2-NEXT: adds r2, #1
+; THUMB2-NEXT: uxtb r3, r1
; THUMB2-NEXT: cmp r3, r0
; THUMB2-NEXT: blt .LBB8_2
; THUMB2-NEXT: @ %bb.3: @ %while.end
diff --git a/llvm/test/CodeGen/ARM/struct-byval-loop.ll b/llvm/test/CodeGen/ARM/struct-byval-loop.ll
index 7a38dec2434f7..a90381acf4214 100644
--- a/llvm/test/CodeGen/ARM/struct-byval-loop.ll
+++ b/llvm/test/CodeGen/ARM/struct-byval-loop.ll
@@ -13,13 +13,13 @@ define void @test_80() {
; CHECK-NEXT: .pad #152
; CHECK-NEXT: sub sp, sp, #152
; CHECK-NEXT: add r0, sp, #72
-; CHECK-NEXT: ldr r1, .LCPI0_0
+; CHECK-NEXT: ldr r2, .LCPI0_0
; CHECK-NEXT: add r0, r0, #12
-; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: .LBB0_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r3, [r0], #4
-; CHECK-NEXT: subs r1, r1, #4
-; CHECK-NEXT: str r3, [r2], #4
+; CHECK-NEXT: subs r2, r2, #4
+; CHECK-NEXT: str r3, [r1], #4
; CHECK-NEXT: bne .LBB0_1
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: add r3, sp, #72
@@ -52,14 +52,14 @@ define void @test_4000() {
; CHECK-NEXT: sub sp, sp, #920
; CHECK-NEXT: sub sp, sp, #3072
; CHECK-NEXT: add lr, sp, #3072
-; CHECK-NEXT: ldr r1, .LCPI1_0
+; CHECK-NEXT: ldr r2, .LCPI1_0
; CHECK-NEXT: add r0, lr, #920
-; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: add r0, r0, #12
; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r3, [r0], #4
-; CHECK-NEXT: subs r1, r1, #4
-; CHECK-NEXT: str r3, [r2], #4
+; CHECK-NEXT: subs r2, r2, #4
+; CHECK-NEXT: str r3, [r1], #4
; CHECK-NEXT: bne .LBB1_1
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: ldr r1, [sp, #3992]
diff --git a/llvm/test/CodeGen/ARM/swifterror.ll b/llvm/test/CodeGen/ARM/swifterror.ll
index f002c54fc60c0..259c20c8c9af6 100644
--- a/llvm/test/CodeGen/ARM/swifterror.ll
+++ b/llvm/test/CodeGen/ARM/swifterror.ll
@@ -79,17 +79,17 @@ define float @caller(ptr %error_ref) {
;
; CHECK-O0-LABEL: caller:
; CHECK-O0: @ %bb.0: @ %entry
-; CHECK-O0-NEXT: push {r7, r8, lr}
-; CHECK-O0-NEXT: add r7, sp, #4
-; CHECK-O0-NEXT: sub sp, sp, #12
+; CHECK-O0-NEXT: push {r7, r8, lr}
+; CHECK-O0-NEXT: add r7, sp, #4
+; CHECK-O0-NEXT: sub sp, sp, #12
; CHECK-O0-NEXT: @ implicit-def: $r1
-; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r8, #0
-; CHECK-O0-NEXT: bl _foo
-; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT: movw r0, #0
-; CHECK-O0-NEXT: cmp r8, r0
-; CHECK-O0-NEXT: bne LBB1_2
+; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r8, #0
+; CHECK-O0-NEXT: bl _foo
+; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT: movw r0, #0
+; CHECK-O0-NEXT: cmp r8, r0
+; CHECK-O0-NEXT: bne LBB1_2
; CHECK-O0-NEXT: @ %bb.1: @ %cont
; CHECK-O0-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
@@ -100,7 +100,7 @@ define float @caller(ptr %error_ref) {
; CHECK-O0-NEXT: bl _free
; CHECK-O0-NEXT: mov r0, #1065353216
; CHECK-O0-NEXT: sub sp, r7, #4
-; CHECK-O0-NEXT: pop {r7, r8, pc}
+; CHECK-O0-NEXT: pop {r7, r8, pc}
;
; CHECK-ANDROID-LABEL: caller:
; CHECK-ANDROID: @ %bb.0: @ %entry
@@ -174,11 +174,11 @@ define float @caller2(ptr %error_ref) {
;
; CHECK-O0-LABEL: caller2:
; CHECK-O0: @ %bb.0: @ %entry
-; CHECK-O0-NEXT: push {r7, r8, lr}
-; CHECK-O0-NEXT: add r7, sp, #4
-; CHECK-O0-NEXT: sub sp, sp, #16
+; CHECK-O0-NEXT: push {r7, r8, lr}
+; CHECK-O0-NEXT: add r7, sp, #4
+; CHECK-O0-NEXT: sub sp, sp, #16
; CHECK-O0-NEXT: @ implicit-def: $r1
-; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill
; CHECK-O0-NEXT: LBB2_1: @ %bb_loop
; CHECK-O0-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-O0-NEXT: mov r8, #0
@@ -206,7 +206,7 @@ define float @caller2(ptr %error_ref) {
; CHECK-O0-NEXT: bl _free
; CHECK-O0-NEXT: mov r0, #1065353216
; CHECK-O0-NEXT: sub sp, r7, #4
-; CHECK-O0-NEXT: pop {r7, r8, pc}
+; CHECK-O0-NEXT: pop {r7, r8, pc}
;
; CHECK-ANDROID-LABEL: caller2:
; CHECK-ANDROID: @ %bb.0: @ %entry
@@ -400,35 +400,35 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
; CHECK-O0-NEXT: mov r7, sp
; CHECK-O0-NEXT: sub sp, sp, #20
; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT: str r8, [r7, #-8] @ 4-byte Spill
; CHECK-O0-NEXT: vmov s0, r1
-; CHECK-O0-NEXT: vstr s0, [r7, #-8] @ 4-byte Spill
-; CHECK-O0-NEXT: str r8, [r7, #-4] @ 4-byte Spill
+; CHECK-O0-NEXT: vstr s0, [r7, #-4] @ 4-byte Spill
; CHECK-O0-NEXT: b LBB4_1
; CHECK-O0-NEXT: LBB4_1: @ %bb_loop
; CHECK-O0-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r0, [r7, #-4] @ 4-byte Reload
-; CHECK-O0-NEXT: cmp r1, #0
-; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r1, [r7, #-8] @ 4-byte Reload
+; CHECK-O0-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT: cmp r0, #0
; CHECK-O0-NEXT: beq LBB4_3
; CHECK-O0-NEXT: @ %bb.2: @ %gen_error
; CHECK-O0-NEXT: @ in Loop: Header=BB4_1 Depth=1
; CHECK-O0-NEXT: mov r0, #16
; CHECK-O0-NEXT: mov r1, #0
; CHECK-O0-NEXT: bl _malloc
-; CHECK-O0-NEXT: mov r2, r0
-; CHECK-O0-NEXT: movw r1, #1
-; CHECK-O0-NEXT: strb r1, [r2, #8]
+; CHECK-O0-NEXT: mov r1, r0
; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT: movw r0, #1
+; CHECK-O0-NEXT: strb r0, [r1, #8]
; CHECK-O0-NEXT: LBB4_3: @ %bb_cont
; CHECK-O0-NEXT: @ in Loop: Header=BB4_1 Depth=1
-; CHECK-O0-NEXT: vldr s0, [r7, #-8] @ 4-byte Reload
+; CHECK-O0-NEXT: vldr s0, [r7, #-4] @ 4-byte Reload
; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-O0-NEXT: vmov.f32 s2, #1.000000e+00
; CHECK-O0-NEXT: vcmp.f32 s0, s2
; CHECK-O0-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-O0-NEXT: str r0, [r7, #-4] @ 4-byte Spill
+; CHECK-O0-NEXT: str r0, [r7, #-8] @ 4-byte Spill
; CHECK-O0-NEXT: ble LBB4_1
; CHECK-O0-NEXT: @ %bb.4: @ %bb_end
; CHECK-O0-NEXT: ldr r8, [sp] @ 4-byte Reload
@@ -581,20 +581,20 @@ define float @caller3(ptr %error_ref) {
;
; CHECK-O0-LABEL: caller3:
; CHECK-O0: @ %bb.0: @ %entry
-; CHECK-O0-NEXT: push {r7, r8, lr}
-; CHECK-O0-NEXT: add r7, sp, #4
-; CHECK-O0-NEXT: sub sp, sp, #44
-; CHECK-O0-NEXT: bfc sp, #0, #3
+; CHECK-O0-NEXT: push {r7, r8, lr}
+; CHECK-O0-NEXT: add r7, sp, #4
+; CHECK-O0-NEXT: sub sp, sp, #44
+; CHECK-O0-NEXT: bfc sp, #0, #3
; CHECK-O0-NEXT: @ implicit-def: $r1
-; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r8, #0
-; CHECK-O0-NEXT: add r0, sp, #16
-; CHECK-O0-NEXT: mov r1, #1
-; CHECK-O0-NEXT: bl _foo_sret
-; CHECK-O0-NEXT: str r8, [sp, #8] @ 4-byte Spill
-; CHECK-O0-NEXT: movw r0, #0
-; CHECK-O0-NEXT: cmp r8, r0
-; CHECK-O0-NEXT: bne LBB6_2
+; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r8, #0
+; CHECK-O0-NEXT: add r0, sp, #16
+; CHECK-O0-NEXT: mov r1, #1
+; CHECK-O0-NEXT: bl _foo_sret
+; CHECK-O0-NEXT: str r8, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT: movw r0, #0
+; CHECK-O0-NEXT: cmp r8, r0
+; CHECK-O0-NEXT: bne LBB6_2
; CHECK-O0-NEXT: @ %bb.1: @ %cont
; CHECK-O0-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
@@ -605,7 +605,7 @@ define float @caller3(ptr %error_ref) {
; CHECK-O0-NEXT: bl _free
; CHECK-O0-NEXT: mov r0, #1065353216
; CHECK-O0-NEXT: sub sp, r7, #4
-; CHECK-O0-NEXT: pop {r7, r8, pc}
+; CHECK-O0-NEXT: pop {r7, r8, pc}
;
; CHECK-ANDROID-LABEL: caller3:
; CHECK-ANDROID: @ %bb.0: @ %entry
@@ -803,26 +803,26 @@ define float @caller4(ptr %error_ref) {
;
; CHECK-O0-LABEL: caller4:
; CHECK-O0: @ %bb.0: @ %entry
-; CHECK-O0-NEXT: push {r7, r8, lr}
-; CHECK-O0-NEXT: add r7, sp, #4
-; CHECK-O0-NEXT: sub sp, sp, #24
+; CHECK-O0-NEXT: push {r7, r8, lr}
+; CHECK-O0-NEXT: add r7, sp, #4
+; CHECK-O0-NEXT: sub sp, sp, #24
; CHECK-O0-NEXT: @ implicit-def: $r1
-; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r8, #0
-; CHECK-O0-NEXT: mov r0, #10
-; CHECK-O0-NEXT: str r0, [r7, #-12]
-; CHECK-O0-NEXT: mov r0, #11
-; CHECK-O0-NEXT: str r0, [sp, #12]
-; CHECK-O0-NEXT: mov r0, #12
-; CHECK-O0-NEXT: str r0, [sp, #8]
-; CHECK-O0-NEXT: ldr r0, [r7, #-12]
-; CHECK-O0-NEXT: ldr r1, [sp, #12]
-; CHECK-O0-NEXT: ldr r2, [sp, #8]
-; CHECK-O0-NEXT: bl _foo_vararg
-; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT: movw r0, #0
-; CHECK-O0-NEXT: cmp r8, r0
-; CHECK-O0-NEXT: bne LBB8_2
+; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r8, #0
+; CHECK-O0-NEXT: mov r0, #10
+; CHECK-O0-NEXT: str r0, [r7, #-12]
+; CHECK-O0-NEXT: mov r0, #11
+; CHECK-O0-NEXT: str r0, [sp, #12]
+; CHECK-O0-NEXT: mov r0, #12
+; CHECK-O0-NEXT: str r0, [sp, #8]
+; CHECK-O0-NEXT: ldr r0, [r7, #-12]
+; CHECK-O0-NEXT: ldr r1, [sp, #12]
+; CHECK-O0-NEXT: ldr r2, [sp, #8]
+; CHECK-O0-NEXT: bl _foo_vararg
+; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT: movw r0, #0
+; CHECK-O0-NEXT: cmp r8, r0
+; CHECK-O0-NEXT: bne LBB8_2
; CHECK-O0-NEXT: @ %bb.1: @ %cont
; CHECK-O0-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
@@ -833,7 +833,7 @@ define float @caller4(ptr %error_ref) {
; CHECK-O0-NEXT: bl _free
; CHECK-O0-NEXT: mov r0, #1065353216
; CHECK-O0-NEXT: sub sp, r7, #4
-; CHECK-O0-NEXT: pop {r7, r8, pc}
+; CHECK-O0-NEXT: pop {r7, r8, pc}
;
; CHECK-ANDROID-LABEL: caller4:
; CHECK-ANDROID: @ %bb.0: @ %entry
@@ -987,12 +987,12 @@ define swiftcc void @swifterror_reg_clobber(ptr nocapture %err) {
;
; CHECK-O0-LABEL: swifterror_reg_clobber:
; CHECK-O0: @ %bb.0:
-; CHECK-O0-NEXT: push {r7, r8, lr}
-; CHECK-O0-NEXT: add r7, sp, #4
+; CHECK-O0-NEXT: push {r7, r8, lr}
+; CHECK-O0-NEXT: add r7, sp, #4
; CHECK-O0-NEXT: @ InlineAsm Start
; CHECK-O0-NEXT: nop
; CHECK-O0-NEXT: @ InlineAsm End
-; CHECK-O0-NEXT: pop {r7, r8, pc}
+; CHECK-O0-NEXT: pop {r7, r8, pc}
;
; CHECK-ANDROID-LABEL: swifterror_reg_clobber:
; CHECK-ANDROID: @ %bb.0:
@@ -1038,34 +1038,34 @@ define swiftcc void @params_in_reg(i32, i32, i32, i32, ptr swiftself, ptr nocapt
;
; CHECK-O0-LABEL: params_in_reg:
; CHECK-O0: @ %bb.0:
-; CHECK-O0-NEXT: push {r7, r10, lr}
-; CHECK-O0-NEXT: add r7, sp, #4
-; CHECK-O0-NEXT: sub sp, sp, #28
-; CHECK-O0-NEXT: bfc sp, #0, #3
-; CHECK-O0-NEXT: str r8, [sp, #20] @ 4-byte Spill
-; CHECK-O0-NEXT: str r10, [sp] @ 4-byte Spill
-; CHECK-O0-NEXT: str r3, [sp, #16] @ 4-byte Spill
-; CHECK-O0-NEXT: str r2, [sp, #12] @ 4-byte Spill
-; CHECK-O0-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT: push {r7, r10, lr}
+; CHECK-O0-NEXT: add r7, sp, #4
+; CHECK-O0-NEXT: sub sp, sp, #28
+; CHECK-O0-NEXT: bfc sp, #0, #3
+; CHECK-O0-NEXT: str r8, [sp, #20] @ 4-byte Spill
+; CHECK-O0-NEXT: str r10, [sp] @ 4-byte Spill
+; CHECK-O0-NEXT: str r3, [sp, #16] @ 4-byte Spill
+; CHECK-O0-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; CHECK-O0-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-O0-NEXT: @ implicit-def: $r0
-; CHECK-O0-NEXT: mov r8, #0
-; CHECK-O0-NEXT: mov r0, #1
-; CHECK-O0-NEXT: mov r1, #2
-; CHECK-O0-NEXT: mov r2, #3
-; CHECK-O0-NEXT: mov r3, #4
-; CHECK-O0-NEXT: mov r10, r8
-; CHECK-O0-NEXT: bl _params_in_reg2
-; CHECK-O0-NEXT: ldr r10, [sp] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r3, [sp, #16] @ 4-byte Reload
-; CHECK-O0-NEXT: mov r9, r8
-; CHECK-O0-NEXT: ldr r8, [sp, #20] @ 4-byte Reload
-; CHECK-O0-NEXT: bl _params_in_reg2
-; CHECK-O0-NEXT: sub sp, r7, #4
-; CHECK-O0-NEXT: pop {r7, r10, pc}
+; CHECK-O0-NEXT: mov r8, #0
+; CHECK-O0-NEXT: mov r0, #1
+; CHECK-O0-NEXT: mov r1, #2
+; CHECK-O0-NEXT: mov r2, #3
+; CHECK-O0-NEXT: mov r3, #4
+; CHECK-O0-NEXT: mov r10, r8
+; CHECK-O0-NEXT: bl _params_in_reg2
+; CHECK-O0-NEXT: ldr r10, [sp] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r3, [sp, #16] @ 4-byte Reload
+; CHECK-O0-NEXT: mov r9, r8
+; CHECK-O0-NEXT: ldr r8, [sp, #20] @ 4-byte Reload
+; CHECK-O0-NEXT: bl _params_in_reg2
+; CHECK-O0-NEXT: sub sp, r7, #4
+; CHECK-O0-NEXT: pop {r7, r10, pc}
;
; CHECK-ANDROID-LABEL: params_in_reg:
; CHECK-ANDROID: @ %bb.0:
@@ -1153,63 +1153,63 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3
;
; CHECK-O0-LABEL: params_and_return_in_reg:
; CHECK-O0: @ %bb.0:
-; CHECK-O0-NEXT: push {r7, r10, lr}
-; CHECK-O0-NEXT: add r7, sp, #4
-; CHECK-O0-NEXT: sub sp, sp, #76
-; CHECK-O0-NEXT: bfc sp, #0, #3
-; CHECK-O0-NEXT: str r8, [sp, #24] @ 4-byte Spill
-; CHECK-O0-NEXT: str r10, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT: str r3, [sp, #20] @ 4-byte Spill
-; CHECK-O0-NEXT: str r2, [sp, #16] @ 4-byte Spill
-; CHECK-O0-NEXT: str r1, [sp, #12] @ 4-byte Spill
-; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT: push {r7, r10, lr}
+; CHECK-O0-NEXT: add r7, sp, #4
+; CHECK-O0-NEXT: sub sp, sp, #76
+; CHECK-O0-NEXT: bfc sp, #0, #3
+; CHECK-O0-NEXT: str r8, [sp, #24] @ 4-byte Spill
+; CHECK-O0-NEXT: str r10, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT: str r3, [sp, #20] @ 4-byte Spill
+; CHECK-O0-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; CHECK-O0-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill
; CHECK-O0-NEXT: @ implicit-def: $r0
-; CHECK-O0-NEXT: mov r8, #0
-; CHECK-O0-NEXT: str r8, [sp, #28] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r0, #1
-; CHECK-O0-NEXT: str r0, [sp, #32] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r1, #2
-; CHECK-O0-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r2, #3
-; CHECK-O0-NEXT: str r2, [sp, #40] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r3, #4
-; CHECK-O0-NEXT: str r3, [sp, #44] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r10, r8
-; CHECK-O0-NEXT: bl _params_in_reg2
-; CHECK-O0-NEXT: ldr r10, [sp, #4] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-O0-NEXT: mov r9, r8
-; CHECK-O0-NEXT: ldr r8, [sp, #24] @ 4-byte Reload
-; CHECK-O0-NEXT: str r9, [sp, #48] @ 4-byte Spill
-; CHECK-O0-NEXT: bl _params_and_return_in_reg2
-; CHECK-O0-NEXT: ldr r10, [sp, #28] @ 4-byte Reload
-; CHECK-O0-NEXT: mov r9, r0
-; CHECK-O0-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
-; CHECK-O0-NEXT: str r9, [sp, #52] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r9, r1
-; CHECK-O0-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
-; CHECK-O0-NEXT: str r9, [sp, #56] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r9, r2
-; CHECK-O0-NEXT: ldr r2, [sp, #40] @ 4-byte Reload
-; CHECK-O0-NEXT: str r9, [sp, #60] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r9, r3
-; CHECK-O0-NEXT: ldr r3, [sp, #44] @ 4-byte Reload
-; CHECK-O0-NEXT: str r9, [sp, #64] @ 4-byte Spill
-; CHECK-O0-NEXT: mov r9, r8
-; CHECK-O0-NEXT: ldr r8, [sp, #48] @ 4-byte Reload
-; CHECK-O0-NEXT: str r9, [sp, #68] @ 4-byte Spill
-; CHECK-O0-NEXT: bl _params_in_reg2
-; CHECK-O0-NEXT: ldr r0, [sp, #52] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r1, [sp, #56] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r2, [sp, #60] @ 4-byte Reload
-; CHECK-O0-NEXT: ldr r3, [sp, #64] @ 4-byte Reload
-; CHECK-O0-NEXT: mov r9, r8
-; CHECK-O0-NEXT: ldr r8, [sp, #68] @ 4-byte Reload
-; CHECK-O0-NEXT: sub sp, r7, #4
-; CHECK-O0-NEXT: pop {r7, r10, pc}
+; CHECK-O0-NEXT: mov r8, #0
+; CHECK-O0-NEXT: str r8, [sp, #28] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r0, #1
+; CHECK-O0-NEXT: str r0, [sp, #32] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r1, #2
+; CHECK-O0-NEXT: str r1, [sp, #36] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r2, #3
+; CHECK-O0-NEXT: str r2, [sp, #40] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r3, #4
+; CHECK-O0-NEXT: str r3, [sp, #44] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r10, r8
+; CHECK-O0-NEXT: bl _params_in_reg2
+; CHECK-O0-NEXT: ldr r10, [sp, #4] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; CHECK-O0-NEXT: mov r9, r8
+; CHECK-O0-NEXT: ldr r8, [sp, #24] @ 4-byte Reload
+; CHECK-O0-NEXT: str r9, [sp, #48] @ 4-byte Spill
+; CHECK-O0-NEXT: bl _params_and_return_in_reg2
+; CHECK-O0-NEXT: ldr r10, [sp, #28] @ 4-byte Reload
+; CHECK-O0-NEXT: mov r9, r0
+; CHECK-O0-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; CHECK-O0-NEXT: str r9, [sp, #52] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r9, r1
+; CHECK-O0-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-O0-NEXT: str r9, [sp, #56] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r9, r2
+; CHECK-O0-NEXT: ldr r2, [sp, #40] @ 4-byte Reload
+; CHECK-O0-NEXT: str r9, [sp, #60] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r9, r3
+; CHECK-O0-NEXT: ldr r3, [sp, #44] @ 4-byte Reload
+; CHECK-O0-NEXT: str r9, [sp, #64] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r9, r8
+; CHECK-O0-NEXT: ldr r8, [sp, #48] @ 4-byte Reload
+; CHECK-O0-NEXT: str r9, [sp, #68] @ 4-byte Spill
+; CHECK-O0-NEXT: bl _params_in_reg2
+; CHECK-O0-NEXT: ldr r0, [sp, #52] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r1, [sp, #56] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r2, [sp, #60] @ 4-byte Reload
+; CHECK-O0-NEXT: ldr r3, [sp, #64] @ 4-byte Reload
+; CHECK-O0-NEXT: mov r9, r8
+; CHECK-O0-NEXT: ldr r8, [sp, #68] @ 4-byte Reload
+; CHECK-O0-NEXT: sub sp, r7, #4
+; CHECK-O0-NEXT: pop {r7, r10, pc}
;
; CHECK-ANDROID-LABEL: params_and_return_in_reg:
; CHECK-ANDROID: @ %bb.0:
@@ -1325,17 +1325,17 @@ define swiftcc ptr @testAssign(ptr %error_ref) {
;
; CHECK-O0-LABEL: testAssign:
; CHECK-O0: @ %bb.0: @ %entry
-; CHECK-O0-NEXT: push {r7, r8, lr}
-; CHECK-O0-NEXT: add r7, sp, #4
-; CHECK-O0-NEXT: sub sp, sp, #8
+; CHECK-O0-NEXT: push {r7, r8, lr}
+; CHECK-O0-NEXT: add r7, sp, #4
+; CHECK-O0-NEXT: sub sp, sp, #8
; CHECK-O0-NEXT: @ implicit-def: $r1
-; CHECK-O0-NEXT: mov r8, #0
-; CHECK-O0-NEXT: bl _foo2
-; CHECK-O0-NEXT: str r8, [sp] @ 4-byte Spill
+; CHECK-O0-NEXT: mov r8, #0
+; CHECK-O0-NEXT: bl _foo2
+; CHECK-O0-NEXT: str r8, [sp] @ 4-byte Spill
; CHECK-O0-NEXT: @ %bb.1: @ %a
; CHECK-O0-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-O0-NEXT: sub sp, r7, #4
-; CHECK-O0-NEXT: pop {r7, r8, pc}
+; CHECK-O0-NEXT: pop {r7, r8, pc}
;
; CHECK-ANDROID-LABEL: testAssign:
; CHECK-ANDROID: @ %bb.0: @ %entry
diff --git a/llvm/test/CodeGen/AVR/bug-81911.ll b/llvm/test/CodeGen/AVR/bug-81911.ll
index 2a22666a1ff92..d3436e2da1d3d 100644
--- a/llvm/test/CodeGen/AVR/bug-81911.ll
+++ b/llvm/test/CodeGen/AVR/bug-81911.ll
@@ -41,31 +41,31 @@ define internal i8 @main() {
; CHECK-NEXT: adiw r24, 6
; CHECK-NEXT: std Y+3, r25 ; 2-byte Folded Spill
; CHECK-NEXT: std Y+2, r24 ; 2-byte Folded Spill
-; CHECK-NEXT: movw r8, r16
-; CHECK-NEXT: movw r6, r16
-; CHECK-NEXT: movw r4, r16
; CHECK-NEXT: movw r2, r16
+; CHECK-NEXT: movw r4, r16
+; CHECK-NEXT: movw r6, r16
+; CHECK-NEXT: movw r8, r16
; CHECK-NEXT: rjmp .LBB0_2
; CHECK-NEXT: .LBB0_1: ; %bb1
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: andi r30, 1
; CHECK-NEXT: ldd r31, Y+4 ; 1-byte Folded Reload
; CHECK-NEXT: dec r31
+; CHECK-NEXT: movw r8, r24
+; CHECK-NEXT: movw r6, r22
+; CHECK-NEXT: movw r4, r20
+; CHECK-NEXT: movw r2, r18
; CHECK-NEXT: cpi r30, 0
-; CHECK-NEXT: movw r8, r18
-; CHECK-NEXT: movw r6, r20
-; CHECK-NEXT: movw r4, r22
-; CHECK-NEXT: movw r2, r24
; CHECK-NEXT: mov r18, r31
; CHECK-NEXT: brne .LBB0_2
; CHECK-NEXT: rjmp .LBB0_4
; CHECK-NEXT: .LBB0_2: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: std Y+4, r18 ; 1-byte Folded Spill
-; CHECK-NEXT: movw r18, r8
-; CHECK-NEXT: movw r20, r6
-; CHECK-NEXT: movw r22, r4
-; CHECK-NEXT: movw r24, r2
+; CHECK-NEXT: movw r18, r2
+; CHECK-NEXT: movw r20, r4
+; CHECK-NEXT: movw r22, r6
+; CHECK-NEXT: movw r24, r8
; CHECK-NEXT: ldi r26, 10
; CHECK-NEXT: ldi r27, 0
; CHECK-NEXT: movw r10, r26
@@ -85,14 +85,14 @@ define internal i8 @main() {
; CHECK-NEXT: ;APP
; CHECK-NEXT: ;NO_APP
; CHECK-NEXT: ldi r30, 1
-; CHECK-NEXT: cp r8, r1
-; CHECK-NEXT: cpc r9, r1
-; CHECK-NEXT: cpc r6, r16
-; CHECK-NEXT: cpc r7, r17
+; CHECK-NEXT: cp r2, r1
+; CHECK-NEXT: cpc r3, r1
; CHECK-NEXT: cpc r4, r16
; CHECK-NEXT: cpc r5, r17
-; CHECK-NEXT: cpc r2, r16
-; CHECK-NEXT: cpc r3, r17
+; CHECK-NEXT: cpc r6, r16
+; CHECK-NEXT: cpc r7, r17
+; CHECK-NEXT: cpc r8, r16
+; CHECK-NEXT: cpc r9, r17
; CHECK-NEXT: breq .LBB0_3
; CHECK-NEXT: rjmp .LBB0_1
; CHECK-NEXT: .LBB0_3: ; %bb1
diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
index 006a8b6bfc94a..4d75a733526b0 100644
--- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; This version of the conv3x3 test has both loops. This test checks that the
-; inner loop has 14 packets.
+; inner loop has 13 packets.
; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
@@ -17,7 +17,6 @@
; CHECK: }
; CHECK: }
; CHECK: }
-; CHECK: }
; CHECK-NOT: }
; CHECK: }{{[ \t]*}}:endloop0
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
index 96a38939dc50e..b50290525002d 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
@@ -12,8 +12,8 @@
; CHECK: [[EPLOG]]:
; CHECK: [[VREG1:v([0-9]+)]] = [[VREG]]
; CHECK: [[EPLOG1]]:
-; CHECK: [[VREG2:v[0-9]+]] = [[VREG1]]
-; CHECK: = vlalign([[VREG1]],[[VREG2]],#1)
+; CHECK: [[VREG2:v[0-9]+]] = [[VREG]]
+; CHECK: = vlalign([[VREG2]],[[VREG1]],#1)
; Function Attrs: nounwind
define void @f0(ptr noalias nocapture readonly %a0, i32 %a1, i32 %a2, ptr noalias nocapture readonly %a3, i32 %a4, ptr noalias nocapture %a5, i32 %a6) #0 {
diff --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
index 42efe60b96d48..c4dbbcc5969ca 100644
--- a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
@@ -3,7 +3,7 @@
; From coremark. Test that we pipeline the matrix multiplication bitextract
; function. The pipelined code should have two packets.
-; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: loop0(.LBB0_[[LOOP:[0-9]+]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: [[REG0:(r[0-9]+)]] = mpyi([[REG1:(r[0-9]+)]],[[REG2:(r[0-9]+)]])
; CHECK: += mpyi
diff --git a/llvm/test/CodeGen/Hexagon/swp-stages4.ll b/llvm/test/CodeGen/Hexagon/swp-stages4.ll
index 0d029dc7d2f2e..bddf9cebe7160 100644
--- a/llvm/test/CodeGen/Hexagon/swp-stages4.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-stages4.ll
@@ -3,11 +3,8 @@
; Test that we rename registers correctly for multiple stages when there is a
; Phi and depends upon another Phi.
-; CHECK: = and
-; CHECK: = and
-; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1)
-; CHECK: = and
-; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
+; CHECK: jump
+; CHECK-NEXT: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255)
; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
diff --git a/llvm/test/CodeGen/Hexagon/tinycore.ll b/llvm/test/CodeGen/Hexagon/tinycore.ll
index c44038e767194..b20a7831df4d8 100644
--- a/llvm/test/CodeGen/Hexagon/tinycore.ll
+++ b/llvm/test/CodeGen/Hexagon/tinycore.ll
@@ -8,10 +8,15 @@
; CHECK: .LBB0_[[LOOP]]:
; CHECK: {
; CHECK-NEXT: mpy
-; CHECK-NEXT: combine
-; CHECK-NEXT: memw
-; CHECK-NEXT: }
+; CHECK-NOT: memw
+; CHECK: }
+; CHECK: {
+; CHECK: memw
+; CHECK-NOT: memw
+; CHECK: }
+; CHECK: {
; CHECK: memw
+; CHECK-NOT: memw
; CHECK: } :endloop0
; Test the loop contains a single packet with 4 instructions.
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index 9142e718e8adc..06edb736e0435 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -39,17 +39,17 @@ define void @test_la_pcrel(i32 signext %n) {
;
; LA64LARGE-LABEL: test_la_pcrel:
; LA64LARGE: # %bb.0: # %entry
-; LA64LARGE-NEXT: pcalau12i $a1, %pc_hi20(l)
-; LA64LARGE-NEXT: addi.d $a2, $zero, %pc_lo12(l)
-; LA64LARGE-NEXT: lu32i.d $a2, %pc64_lo20(l)
-; LA64LARGE-NEXT: lu52i.d $a2, $a2, %pc64_hi12(l)
-; LA64LARGE-NEXT: move $a3, $zero
+; LA64LARGE-NEXT: move $a1, $zero
+; LA64LARGE-NEXT: pcalau12i $a2, %pc_hi20(l)
+; LA64LARGE-NEXT: addi.d $a3, $zero, %pc_lo12(l)
+; LA64LARGE-NEXT: lu32i.d $a3, %pc64_lo20(l)
+; LA64LARGE-NEXT: lu52i.d $a3, $a3, %pc64_hi12(l)
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB0_1: # %loop
; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1
-; LA64LARGE-NEXT: ldx.w $zero, $a2, $a1
-; LA64LARGE-NEXT: addi.w $a3, $a3, 1
-; LA64LARGE-NEXT: blt $a3, $a0, .LBB0_1
+; LA64LARGE-NEXT: ldx.w $zero, $a3, $a2
+; LA64LARGE-NEXT: addi.w $a1, $a1, 1
+; LA64LARGE-NEXT: blt $a1, $a0, .LBB0_1
; LA64LARGE-NEXT: # %bb.2: # %ret
; LA64LARGE-NEXT: ret
entry:
@@ -99,18 +99,18 @@ define void @test_la_got(i32 signext %n) {
;
; LA64LARGE-LABEL: test_la_got:
; LA64LARGE: # %bb.0: # %entry
-; LA64LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(g)
-; LA64LARGE-NEXT: addi.d $a2, $zero, %got_pc_lo12(g)
-; LA64LARGE-NEXT: lu32i.d $a2, %got64_pc_lo20(g)
-; LA64LARGE-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(g)
-; LA64LARGE-NEXT: ldx.d $a1, $a2, $a1
-; LA64LARGE-NEXT: move $a2, $zero
+; LA64LARGE-NEXT: move $a1, $zero
+; LA64LARGE-NEXT: pcalau12i $a2, %got_pc_hi20(g)
+; LA64LARGE-NEXT: addi.d $a3, $zero, %got_pc_lo12(g)
+; LA64LARGE-NEXT: lu32i.d $a3, %got64_pc_lo20(g)
+; LA64LARGE-NEXT: lu52i.d $a3, $a3, %got64_pc_hi12(g)
+; LA64LARGE-NEXT: ldx.d $a2, $a3, $a2
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB1_1: # %loop
; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1
-; LA64LARGE-NEXT: ld.w $zero, $a1, 0
-; LA64LARGE-NEXT: addi.w $a2, $a2, 1
-; LA64LARGE-NEXT: blt $a2, $a0, .LBB1_1
+; LA64LARGE-NEXT: ld.w $zero, $a2, 0
+; LA64LARGE-NEXT: addi.w $a1, $a1, 1
+; LA64LARGE-NEXT: blt $a1, $a0, .LBB1_1
; LA64LARGE-NEXT: # %bb.2: # %ret
; LA64LARGE-NEXT: ret
entry:
@@ -161,18 +161,18 @@ define void @test_la_tls_ie(i32 signext %n) {
;
; LA64LARGE-LABEL: test_la_tls_ie:
; LA64LARGE: # %bb.0: # %entry
-; LA64LARGE-NEXT: pcalau12i $a1, %ie_pc_hi20(ie)
-; LA64LARGE-NEXT: addi.d $a2, $zero, %ie_pc_lo12(ie)
-; LA64LARGE-NEXT: lu32i.d $a2, %ie64_pc_lo20(ie)
-; LA64LARGE-NEXT: lu52i.d $a2, $a2, %ie64_pc_hi12(ie)
-; LA64LARGE-NEXT: ldx.d $a1, $a2, $a1
-; LA64LARGE-NEXT: move $a2, $zero
+; LA64LARGE-NEXT: move $a1, $zero
+; LA64LARGE-NEXT: pcalau12i $a2, %ie_pc_hi20(ie)
+; LA64LARGE-NEXT: addi.d $a3, $zero, %ie_pc_lo12(ie)
+; LA64LARGE-NEXT: lu32i.d $a3, %ie64_pc_lo20(ie)
+; LA64LARGE-NEXT: lu52i.d $a3, $a3, %ie64_pc_hi12(ie)
+; LA64LARGE-NEXT: ldx.d $a2, $a3, $a2
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB2_1: # %loop
; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1
-; LA64LARGE-NEXT: ldx.w $zero, $a1, $tp
-; LA64LARGE-NEXT: addi.w $a2, $a2, 1
-; LA64LARGE-NEXT: blt $a2, $a0, .LBB2_1
+; LA64LARGE-NEXT: ldx.w $zero, $a2, $tp
+; LA64LARGE-NEXT: addi.w $a1, $a1, 1
+; LA64LARGE-NEXT: blt $a1, $a0, .LBB2_1
; LA64LARGE-NEXT: # %bb.2: # %ret
; LA64LARGE-NEXT: ret
entry:
@@ -270,11 +270,11 @@ define void @test_la_tls_ld(i32 signext %n) {
; LA64LARGE-NEXT: .cfi_offset 23, -24
; LA64LARGE-NEXT: .cfi_offset 24, -32
; LA64LARGE-NEXT: move $fp, $a0
+; LA64LARGE-NEXT: move $s1, $zero
; LA64LARGE-NEXT: pcalau12i $a0, %ld_pc_hi20(ld)
; LA64LARGE-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld)
; LA64LARGE-NEXT: lu32i.d $a1, %got64_pc_lo20(ld)
; LA64LARGE-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld)
-; LA64LARGE-NEXT: move $s1, $zero
; LA64LARGE-NEXT: add.d $s0, $a1, $a0
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB3_1: # %loop
@@ -436,11 +436,11 @@ define void @test_la_tls_gd(i32 signext %n) nounwind {
; LA64LARGE-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
; LA64LARGE-NEXT: st.d $s1, $sp, 0 # 8-byte Folded Spill
; LA64LARGE-NEXT: move $fp, $a0
+; LA64LARGE-NEXT: move $s1, $zero
; LA64LARGE-NEXT: pcalau12i $a0, %gd_pc_hi20(gd)
; LA64LARGE-NEXT: addi.d $a1, $zero, %got_pc_lo12(gd)
; LA64LARGE-NEXT: lu32i.d $a1, %got64_pc_lo20(gd)
; LA64LARGE-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(gd)
-; LA64LARGE-NEXT: move $s1, $zero
; LA64LARGE-NEXT: add.d $s0, $a1, $a0
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB5_1: # %loop
diff --git a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir
index 8bdf719f4bb5b..59f1477b5c37f 100644
--- a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir
+++ b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir
@@ -80,10 +80,9 @@ body: |
# CHECK-NEXT: %15:g8rc = COPY killed %6
# CHECK: bb.3:
# CHECK: %10:g8rc = COPY killed %15
-# CHECK-NEXT: %9:g8rc = COPY killed %14
+# CHECK-NEXT: %16:g8rc_and_g8rc_nox0 = COPY killed %14
# CHECK-NEXT: %14:g8rc = COPY killed %10
# CHECK-NEXT: %15:g8rc = IMPLICIT_DEF
-# CHECK-NEXT: %16:g8rc_and_g8rc_nox0 = COPY killed %9
# CHECK-NEXT: BCC 68, %7, %bb.3
# CHECK-NEXT: B %bb.4
# CHECK: bb.4:
diff --git a/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll b/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll
index cd5ea16d4600b..cd2fbdfe71263 100644
--- a/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll
+++ b/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll
@@ -51,9 +51,9 @@ define ppc_fp128 @test_ctr0() {
; P9BE-NEXT: .cfi_offset r30, -16
; P9BE-NEXT: li r3, 1
; P9BE-NEXT: std r30, 112(r1) # 8-byte Folded Spill
-; P9BE-NEXT: xxlxor f1, f1, f1
-; P9BE-NEXT: rldic r30, r3, 62, 1
; P9BE-NEXT: xxlxor f2, f2, f2
+; P9BE-NEXT: rldic r30, r3, 62, 1
+; P9BE-NEXT: xxlxor f1, f1, f1
; P9BE-NEXT: .p2align 5
; P9BE-NEXT: .LBB0_1: # %bb6
; P9BE-NEXT: #
@@ -111,8 +111,8 @@ define ppc_fp128 @test_ctr0() {
; P8BE-NEXT: .cfi_offset r30, -16
; P8BE-NEXT: li r3, 1
; P8BE-NEXT: std r30, 112(r1) # 8-byte Folded Spill
-; P8BE-NEXT: xxlxor f1, f1, f1
; P8BE-NEXT: xxlxor f2, f2, f2
+; P8BE-NEXT: xxlxor f1, f1, f1
; P8BE-NEXT: rldic r30, r3, 62, 1
; P8BE-NEXT: .p2align 5
; P8BE-NEXT: .LBB0_1: # %bb6
diff --git a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
index 72f778286abe4..a4b18e648b7a2 100644
--- a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
+++ b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
@@ -195,12 +195,9 @@ body: |
; CHECK: bb.4:
; CHECK: successors: %bb.5(0x80000000)
; CHECK: %44:g8rc_and_g8rc_nox0 = COPY killed %59
- ; CHECK: %43:gprc = COPY killed %57
- ; CHECK: %41:gprc = COPY killed %60
- ; CHECK: %39:g8rc = COPY killed %44
- ; CHECK: %61:gprc = COPY killed %41
- ; CHECK: %62:g8rc_and_g8rc_nox0 = COPY killed %39
- ; CHECK: %63:gprc = COPY killed %43
+ ; CHECK: %63:gprc = COPY killed %57
+ ; CHECK: %61:gprc = COPY killed %60
+ ; CHECK: %62:g8rc_and_g8rc_nox0 = COPY killed %44
; CHECK: bb.5:
; CHECK: successors: %bb.6(0x80000000)
diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir b/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir
index 474c288bba88b..4cad98eeade77 100644
--- a/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir
+++ b/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir
@@ -1,21 +1,10 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple powerpc64le-unknown-linux-gnu -start-after=codegenprepare \
# RUN: -o - %s -verify-machineinstrs | FileCheck %s
--- |
define ppc_fp128 @freeze_select(ppc_fp128 %a, ppc_fp128 %b) {
- %sel.frozen = freeze ppc_fp128 %a
- %cmp = fcmp one ppc_fp128 %sel.frozen, 0xM00000000000000000000000000000000
- br i1 %cmp, label %select.end, label %select.false
-
- select.false: ; preds = %0
- br label %select.end
-
- select.end: ; preds = %0, %select.false
- %sel = phi ppc_fp128 [ %a, %0 ], [ %b, %select.false ]
- ret ppc_fp128 %sel
- }
-
- ; CHECK-LABEL: freeze_select
+ ; CHECK-LABEL: freeze_select:
; CHECK: # %bb.0:
; CHECK-NEXT: xxlxor 0, 0, 0
; CHECK-NEXT: fcmpu 1, 2, 2
@@ -28,8 +17,19 @@
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: crnor 20, 7, 2
; CHECK-NEXT: bclr 12, 20, 0
- ; CHECK-NEXT: # %bb.2: # %select.false
- ; CHECK-NEXT: fmr 1, 3
+ ; CHECK-NEXT: # %bb.2: # %select.false
; CHECK-NEXT: fmr 2, 4
+ ; CHECK-NEXT: fmr 1, 3
; CHECK-NEXT: blr
+ %sel.frozen = freeze ppc_fp128 %a
+ %cmp = fcmp one ppc_fp128 %sel.frozen, 0xM00000000000000000000000000000000
+ br i1 %cmp, label %select.end, label %select.false
+
+ select.false: ; preds = %0
+ br label %select.end
+
+ select.end: ; preds = %0, %select.false
+ %sel = phi ppc_fp128 [ %a, %0 ], [ %b, %select.false ]
+ ret ppc_fp128 %sel
+ }
...
diff --git a/llvm/test/CodeGen/PowerPC/pr116071.ll b/llvm/test/CodeGen/PowerPC/pr116071.ll
index 29f11fc1d3a63..5db84436c22f6 100644
--- a/llvm/test/CodeGen/PowerPC/pr116071.ll
+++ b/llvm/test/CodeGen/PowerPC/pr116071.ll
@@ -1,9 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -disable-ppc-vsx-fma-mutation=false -mcpu=pwr10 -verify-machineinstrs \
-; RUN: -ppc-asm-full-reg-names -mtriple powerpc64-ibm-aix7.2.0.0 < %s | FileCheck %s
+; RUN: -ppc-asm-full-reg-names -mtriple powerpc64-ibm-aix7.2.0.0 < %s | FileCheck %s
target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512"
define void @initial(<2 x double> %0){
+; CHECK-LABEL: initial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxlxor vs0, vs0, vs0
+; CHECK-NEXT: xxlxor f2, f2, f2
+; CHECK-NEXT: xxlxor f4, f4, f4
+; CHECK-NEXT: xxlxor f3, f3, f3
+; CHECK-NEXT: xvmuldp vs1, vs34, vs0
+; CHECK-NEXT: .align 5
+; CHECK-NEXT: L..BB0_1: # %for.cond251.preheader.lr.ph
+; CHECK-NEXT: #
+; CHECK-NEXT: fmr f5, f3
+; CHECK-NEXT: xsadddp f3, f3, f4
+; CHECK-NEXT: fmr f4, f5
+; CHECK-NEXT: xxmrghd vs3, vs3, vs2
+; CHECK-NEXT: xvmaddmdp vs3, vs0, vs1
+; CHECK-NEXT: b L..BB0_1
entry:
%1 = fmul <2 x double> %0, zeroinitializer
br label %for.cond251.preheader.lr.ph
@@ -18,9 +35,3 @@ for.cond251.preheader.lr.ph: ; preds = %for.cond251.prehead
%7 = extractelement <2 x double> %6, i64 0
br label %for.cond251.preheader.lr.ph
}
-
-; CHECK: xsadddp f4, f3, f4
-; CHECK-NEXT: xxmrghd vs5, vs4, vs2
-; CHECK-NEXT: fmr f4, f3
-; CHECK-NEXT: xvmaddmdp vs5, vs0, vs1
-; CHECK-NEXT: fmr f3, f5
diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll
index 4904d11fc8104..0077673292ab3 100644
--- a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll
@@ -9,7 +9,7 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr {
; CHECK-NEXT: li 5, 55
; CHECK-NEXT: li 6, 48
; CHECK-NEXT: mtctr 3
-; CHECK-NEXT: bdz .LBB0_4
+; CHECK-NEXT: bdz .LBB0_3
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: divw 9, 8, 4
; CHECK-NEXT: mullw 7, 8, 4
@@ -19,7 +19,7 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr {
; CHECK-NEXT: add 3, 7, 3
; CHECK-NEXT: stbu 3, -1(7)
; CHECK-NEXT: mr 3, 8
-; CHECK-NEXT: bdz .LBB0_3
+; CHECK-NEXT: bdz .LBB0_4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: mr 3, 9
@@ -33,13 +33,12 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr {
; CHECK-NEXT: stbu 8, -1(7)
; CHECK-NEXT: mr 8, 3
; CHECK-NEXT: bdnz .LBB0_2
+; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: mr 8, 9
-; CHECK-NEXT: b .LBB0_5
-; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: # implicit-def: $x7
-; CHECK-NEXT: .LBB0_5:
-; CHECK-NEXT: mullw 4, 8, 4
+; CHECK-NEXT: mr 9, 8
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: mullw 4, 9, 4
; CHECK-NEXT: sub 3, 3, 4
; CHECK-NEXT: cmplwi 3, 10
; CHECK-NEXT: isellt 4, 6, 5
diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
index 628822edabf39..2dd8b36389c62 100644
--- a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
@@ -19,34 +19,34 @@ define void @phi3(ptr) nounwind {
; CHECK-NEXT: mr 29, 3
; CHECK-NEXT: bl malloc
; CHECK-NEXT: nop
-; CHECK-NEXT: addi 7, 30, -4
+; CHECK-NEXT: addi 6, 30, -4
; CHECK-NEXT: mtctr 3
; CHECK-NEXT: addi 4, 29, -8
; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: lwzu 8, 4(7)
+; CHECK-NEXT: lwzu 8, 4(6)
; CHECK-NEXT: bdz .LBB0_5
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: extswsli 6, 5, 5
+; CHECK-NEXT: extswsli 7, 5, 5
; CHECK-NEXT: add 5, 8, 5
-; CHECK-NEXT: lwzu 8, 4(7)
+; CHECK-NEXT: lwzu 8, 4(6)
; CHECK-NEXT: bdz .LBB0_4
; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: add 6, 3, 6
-; CHECK-NEXT: stdu 6, 8(4)
-; CHECK-NEXT: extswsli 6, 5, 5
+; CHECK-NEXT: add 7, 3, 7
+; CHECK-NEXT: stdu 7, 8(4)
+; CHECK-NEXT: extswsli 7, 5, 5
; CHECK-NEXT: add 5, 8, 5
-; CHECK-NEXT: lwzu 8, 4(7)
+; CHECK-NEXT: lwzu 8, 4(6)
; CHECK-NEXT: bdz .LBB0_4
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: add 9, 3, 6
-; CHECK-NEXT: extswsli 6, 5, 5
+; CHECK-NEXT: add 9, 3, 7
+; CHECK-NEXT: extswsli 7, 5, 5
; CHECK-NEXT: add 5, 8, 5
-; CHECK-NEXT: lwzu 8, 4(7)
+; CHECK-NEXT: lwzu 8, 4(6)
; CHECK-NEXT: stdu 9, 8(4)
; CHECK-NEXT: bdnz .LBB0_3
; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: add 6, 3, 6
+; CHECK-NEXT: add 6, 3, 7
; CHECK-NEXT: stdu 6, 8(4)
; CHECK-NEXT: .LBB0_5:
; CHECK-NEXT: extswsli 5, 5, 5
diff --git a/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll b/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll
index e225e63980c7f..eec5b4588f7c3 100644
--- a/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll
+++ b/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll
@@ -16,13 +16,12 @@ define dso_local signext i32 @main(i32 signext %argc, ptr nocapture readnone %ar
; CHECK-NEXT: stw 12, 8(1)
; CHECK-NEXT: mflr 0
; CHECK-NEXT: stdu 1, -784(1)
-; CHECK-NEXT: # kill: def $r3 killed $r3 killed $x3
-; CHECK-NEXT: cmpwi 2, 3, 2
-; CHECK-NEXT: li 4, 0
-; CHECK-NEXT: # kill: def $r4 killed $r4 killed $x4
-; CHECK-NEXT: mr 3, 4
+; CHECK-NEXT: mr 4, 3
; CHECK-NEXT: std 0, 800(1)
; CHECK-NEXT: mr 31, 1
+; CHECK-NEXT: li 3, 0
+; CHECK-NEXT: # kill: def $r3 killed $r3 killed $x3
+; CHECK-NEXT: cmpwi 2, 4, 2
; CHECK-NEXT: blt 2, .LBB0_3
; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: addi 3, 31, 112
@@ -66,7 +65,6 @@ define dso_local signext i32 @main(i32 signext %argc, ptr nocapture readnone %ar
; BE-NEXT: stdu 1, -800(1)
; BE-NEXT: li 4, 0
; BE-NEXT: # kill: def $r3 killed $r3 killed $x3
-; BE-NEXT: # kill: def $r4 killed $r4 killed $x4
; BE-NEXT: cmpwi 2, 3, 2
; BE-NEXT: mr 3, 4
; BE-NEXT: std 0, 816(1)
diff --git a/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll b/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll
index f696745c9d414..10fa8221778f5 100644
--- a/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll
+++ b/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll
@@ -60,27 +60,27 @@ define void @jbd2_journal_commit_transaction(i32 %input1, ptr %input2, ptr %inpu
; CHECK-NO-ISEL-NEXT: bne- 0, .Ltmp0
; CHECK-NO-ISEL-EMPTY:
; CHECK-NO-ISEL-NEXT: #NO_APP
-; CHECK-NO-ISEL-NEXT: std 5, 0(6)
+; CHECK-NO-ISEL-NEXT: std 4, 0(6)
; CHECK-NO-ISEL-NEXT: beq- 5, .LBB0_6
; CHECK-NO-ISEL-NEXT: .LBB0_2: # %while.body392
; CHECK-NO-ISEL-NEXT: #
; CHECK-NO-ISEL-NEXT: bne- 1, .LBB0_5
; CHECK-NO-ISEL-NEXT: # %bb.3: # %wait_on_buffer.exit1319
; CHECK-NO-ISEL-NEXT: #
-; CHECK-NO-ISEL-NEXT: ld 5, 0(6)
-; CHECK-NO-ISEL-NEXT: mr 9, 5
-; CHECK-NO-ISEL-NEXT: ldu 4, -72(9)
-; CHECK-NO-ISEL-NEXT: andi. 4, 4, 1
-; CHECK-NO-ISEL-NEXT: mr 4, 3
+; CHECK-NO-ISEL-NEXT: ld 4, 0(6)
+; CHECK-NO-ISEL-NEXT: mr 9, 4
+; CHECK-NO-ISEL-NEXT: ldu 5, -72(9)
+; CHECK-NO-ISEL-NEXT: andi. 5, 5, 1
+; CHECK-NO-ISEL-NEXT: mr 5, 3
; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB0_1
; CHECK-NO-ISEL-NEXT: # %bb.4: # %wait_on_buffer.exit1319
; CHECK-NO-ISEL-NEXT: #
-; CHECK-NO-ISEL-NEXT: li 4, -5
+; CHECK-NO-ISEL-NEXT: li 5, -5
; CHECK-NO-ISEL-NEXT: b .LBB0_1
; CHECK-NO-ISEL-NEXT: .LBB0_5:
-; CHECK-NO-ISEL-NEXT: mr 4, 7
+; CHECK-NO-ISEL-NEXT: mr 5, 7
; CHECK-NO-ISEL-NEXT: .LBB0_6: # %while.end418
-; CHECK-NO-ISEL-NEXT: cmplwi 4, 0
+; CHECK-NO-ISEL-NEXT: cmplwi 5, 0
; CHECK-NO-ISEL-NEXT: beq 0, .LBB0_8
; CHECK-NO-ISEL-NEXT: # %bb.7: # %if.then420
; CHECK-NO-ISEL-NEXT: .LBB0_8: # %if.end421
diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index 14b3d69f8c273..9e0dc87f0ab8b 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -2488,8 +2488,8 @@ define double @test82(double %a, double %b, double %c, double %d) {
; CHECK-FISL: # %bb.0: # %entry
; CHECK-FISL-NEXT: stfd f2, -16(r1) # 8-byte Folded Spill
; CHECK-FISL-NEXT: fmr f2, f1
-; CHECK-FISL-NEXT: xscmpudp cr0, f3, f4
; CHECK-FISL-NEXT: stfd f2, -8(r1) # 8-byte Folded Spill
+; CHECK-FISL-NEXT: xscmpudp cr0, f3, f4
; CHECK-FISL-NEXT: beq cr0, .LBB67_2
; CHECK-FISL-NEXT: # %bb.1: # %entry
; CHECK-FISL-NEXT: lfd f0, -16(r1) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index efb4e1a6f15d6..2133ffeeb19dc 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -543,17 +543,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw t1, 12(a2)
; RV32I-NEXT: lw a2, 4(a2)
; RV32I-NEXT: sltu t0, a6, a5
-; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: mv t3, t0
; RV32I-NEXT: beq a7, t1, .LBB11_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: slt t4, t1, a7
+; RV32I-NEXT: slt t3, t1, a7
; RV32I-NEXT: .LBB11_2:
; RV32I-NEXT: sltu t2, a1, a3
; RV32I-NEXT: sltu t5, a2, a4
-; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: mv t4, t2
; RV32I-NEXT: beq a4, a2, .LBB11_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: mv t4, t5
; RV32I-NEXT: .LBB11_4:
; RV32I-NEXT: addi sp, sp, -16
; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -562,12 +562,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or t6, s0, t6
; RV32I-NEXT: beqz t6, .LBB11_6
; RV32I-NEXT: # %bb.5:
-; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: mv t4, t3
; RV32I-NEXT: .LBB11_6:
-; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: mv t3, t2
; RV32I-NEXT: beq a2, a4, .LBB11_8
; RV32I-NEXT: # %bb.7:
-; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: mv t3, t5
; RV32I-NEXT: .LBB11_8:
; RV32I-NEXT: sltu t5, a3, a1
; RV32I-NEXT: mv t6, t5
@@ -575,17 +575,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: # %bb.9:
; RV32I-NEXT: sltu t6, a4, a2
; RV32I-NEXT: .LBB11_10:
-; RV32I-NEXT: bnez t3, .LBB11_12
+; RV32I-NEXT: bnez t4, .LBB11_12
; RV32I-NEXT: # %bb.11:
; RV32I-NEXT: sub a7, t1, a7
; RV32I-NEXT: sub a5, a6, a5
; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a2, a2, a4
; RV32I-NEXT: sub a4, a7, t0
-; RV32I-NEXT: sltu a6, a5, t4
+; RV32I-NEXT: sltu a6, a5, t3
; RV32I-NEXT: sub a3, a2, t2
; RV32I-NEXT: sub a2, a4, a6
-; RV32I-NEXT: sub a4, a5, t4
+; RV32I-NEXT: sub a4, a5, t3
; RV32I-NEXT: j .LBB11_13
; RV32I-NEXT: .LBB11_12:
; RV32I-NEXT: sltu t0, a5, a6
@@ -639,17 +639,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: lw t1, 12(a2)
; RV32ZBB-NEXT: lw a2, 4(a2)
; RV32ZBB-NEXT: sltu t0, a6, a5
-; RV32ZBB-NEXT: mv t4, t0
+; RV32ZBB-NEXT: mv t3, t0
; RV32ZBB-NEXT: beq a7, t1, .LBB11_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: slt t4, t1, a7
+; RV32ZBB-NEXT: slt t3, t1, a7
; RV32ZBB-NEXT: .LBB11_2:
; RV32ZBB-NEXT: sltu t2, a1, a3
; RV32ZBB-NEXT: sltu t5, a2, a4
-; RV32ZBB-NEXT: mv t3, t2
+; RV32ZBB-NEXT: mv t4, t2
; RV32ZBB-NEXT: beq a4, a2, .LBB11_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: mv t3, t5
+; RV32ZBB-NEXT: mv t4, t5
; RV32ZBB-NEXT: .LBB11_4:
; RV32ZBB-NEXT: addi sp, sp, -16
; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -658,12 +658,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: or t6, s0, t6
; RV32ZBB-NEXT: beqz t6, .LBB11_6
; RV32ZBB-NEXT: # %bb.5:
-; RV32ZBB-NEXT: mv t3, t4
+; RV32ZBB-NEXT: mv t4, t3
; RV32ZBB-NEXT: .LBB11_6:
-; RV32ZBB-NEXT: mv t4, t2
+; RV32ZBB-NEXT: mv t3, t2
; RV32ZBB-NEXT: beq a2, a4, .LBB11_8
; RV32ZBB-NEXT: # %bb.7:
-; RV32ZBB-NEXT: mv t4, t5
+; RV32ZBB-NEXT: mv t3, t5
; RV32ZBB-NEXT: .LBB11_8:
; RV32ZBB-NEXT: sltu t5, a3, a1
; RV32ZBB-NEXT: mv t6, t5
@@ -671,17 +671,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: # %bb.9:
; RV32ZBB-NEXT: sltu t6, a4, a2
; RV32ZBB-NEXT: .LBB11_10:
-; RV32ZBB-NEXT: bnez t3, .LBB11_12
+; RV32ZBB-NEXT: bnez t4, .LBB11_12
; RV32ZBB-NEXT: # %bb.11:
; RV32ZBB-NEXT: sub a7, t1, a7
; RV32ZBB-NEXT: sub a5, a6, a5
; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: sub a2, a2, a4
; RV32ZBB-NEXT: sub a4, a7, t0
-; RV32ZBB-NEXT: sltu a6, a5, t4
+; RV32ZBB-NEXT: sltu a6, a5, t3
; RV32ZBB-NEXT: sub a3, a2, t2
; RV32ZBB-NEXT: sub a2, a4, a6
-; RV32ZBB-NEXT: sub a4, a5, t4
+; RV32ZBB-NEXT: sub a4, a5, t3
; RV32ZBB-NEXT: j .LBB11_13
; RV32ZBB-NEXT: .LBB11_12:
; RV32ZBB-NEXT: sltu t0, a5, a6
@@ -743,17 +743,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw t1, 12(a2)
; RV32I-NEXT: lw a2, 4(a2)
; RV32I-NEXT: sltu t0, a6, a5
-; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: mv t3, t0
; RV32I-NEXT: beq a7, t1, .LBB12_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: slt t4, t1, a7
+; RV32I-NEXT: slt t3, t1, a7
; RV32I-NEXT: .LBB12_2:
; RV32I-NEXT: sltu t2, a1, a3
; RV32I-NEXT: sltu t5, a2, a4
-; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: mv t4, t2
; RV32I-NEXT: beq a4, a2, .LBB12_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: mv t4, t5
; RV32I-NEXT: .LBB12_4:
; RV32I-NEXT: addi sp, sp, -16
; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -762,12 +762,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or t6, s0, t6
; RV32I-NEXT: beqz t6, .LBB12_6
; RV32I-NEXT: # %bb.5:
-; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: mv t4, t3
; RV32I-NEXT: .LBB12_6:
-; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: mv t3, t2
; RV32I-NEXT: beq a2, a4, .LBB12_8
; RV32I-NEXT: # %bb.7:
-; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: mv t3, t5
; RV32I-NEXT: .LBB12_8:
; RV32I-NEXT: sltu t5, a3, a1
; RV32I-NEXT: mv t6, t5
@@ -775,17 +775,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: # %bb.9:
; RV32I-NEXT: sltu t6, a4, a2
; RV32I-NEXT: .LBB12_10:
-; RV32I-NEXT: bnez t3, .LBB12_12
+; RV32I-NEXT: bnez t4, .LBB12_12
; RV32I-NEXT: # %bb.11:
; RV32I-NEXT: sub a7, t1, a7
; RV32I-NEXT: sub a5, a6, a5
; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a2, a2, a4
; RV32I-NEXT: sub a4, a7, t0
-; RV32I-NEXT: sltu a6, a5, t4
+; RV32I-NEXT: sltu a6, a5, t3
; RV32I-NEXT: sub a3, a2, t2
; RV32I-NEXT: sub a2, a4, a6
-; RV32I-NEXT: sub a4, a5, t4
+; RV32I-NEXT: sub a4, a5, t3
; RV32I-NEXT: j .LBB12_13
; RV32I-NEXT: .LBB12_12:
; RV32I-NEXT: sltu t0, a5, a6
@@ -839,17 +839,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: lw t1, 12(a2)
; RV32ZBB-NEXT: lw a2, 4(a2)
; RV32ZBB-NEXT: sltu t0, a6, a5
-; RV32ZBB-NEXT: mv t4, t0
+; RV32ZBB-NEXT: mv t3, t0
; RV32ZBB-NEXT: beq a7, t1, .LBB12_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: slt t4, t1, a7
+; RV32ZBB-NEXT: slt t3, t1, a7
; RV32ZBB-NEXT: .LBB12_2:
; RV32ZBB-NEXT: sltu t2, a1, a3
; RV32ZBB-NEXT: sltu t5, a2, a4
-; RV32ZBB-NEXT: mv t3, t2
+; RV32ZBB-NEXT: mv t4, t2
; RV32ZBB-NEXT: beq a4, a2, .LBB12_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: mv t3, t5
+; RV32ZBB-NEXT: mv t4, t5
; RV32ZBB-NEXT: .LBB12_4:
; RV32ZBB-NEXT: addi sp, sp, -16
; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -858,12 +858,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: or t6, s0, t6
; RV32ZBB-NEXT: beqz t6, .LBB12_6
; RV32ZBB-NEXT: # %bb.5:
-; RV32ZBB-NEXT: mv t3, t4
+; RV32ZBB-NEXT: mv t4, t3
; RV32ZBB-NEXT: .LBB12_6:
-; RV32ZBB-NEXT: mv t4, t2
+; RV32ZBB-NEXT: mv t3, t2
; RV32ZBB-NEXT: beq a2, a4, .LBB12_8
; RV32ZBB-NEXT: # %bb.7:
-; RV32ZBB-NEXT: mv t4, t5
+; RV32ZBB-NEXT: mv t3, t5
; RV32ZBB-NEXT: .LBB12_8:
; RV32ZBB-NEXT: sltu t5, a3, a1
; RV32ZBB-NEXT: mv t6, t5
@@ -871,17 +871,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: # %bb.9:
; RV32ZBB-NEXT: sltu t6, a4, a2
; RV32ZBB-NEXT: .LBB12_10:
-; RV32ZBB-NEXT: bnez t3, .LBB12_12
+; RV32ZBB-NEXT: bnez t4, .LBB12_12
; RV32ZBB-NEXT: # %bb.11:
; RV32ZBB-NEXT: sub a7, t1, a7
; RV32ZBB-NEXT: sub a5, a6, a5
; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: sub a2, a2, a4
; RV32ZBB-NEXT: sub a4, a7, t0
-; RV32ZBB-NEXT: sltu a6, a5, t4
+; RV32ZBB-NEXT: sltu a6, a5, t3
; RV32ZBB-NEXT: sub a3, a2, t2
; RV32ZBB-NEXT: sub a2, a4, a6
-; RV32ZBB-NEXT: sub a4, a5, t4
+; RV32ZBB-NEXT: sub a4, a5, t3
; RV32ZBB-NEXT: j .LBB12_13
; RV32ZBB-NEXT: .LBB12_12:
; RV32ZBB-NEXT: sltu t0, a5, a6
@@ -1132,17 +1132,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw t1, 12(a2)
; RV32I-NEXT: lw a2, 4(a2)
; RV32I-NEXT: sltu t0, a6, a5
-; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: mv t3, t0
; RV32I-NEXT: beq a7, t1, .LBB17_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: slt t4, t1, a7
+; RV32I-NEXT: slt t3, t1, a7
; RV32I-NEXT: .LBB17_2:
; RV32I-NEXT: sltu t2, a1, a3
; RV32I-NEXT: sltu t5, a2, a4
-; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: mv t4, t2
; RV32I-NEXT: beq a4, a2, .LBB17_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: mv t4, t5
; RV32I-NEXT: .LBB17_4:
; RV32I-NEXT: addi sp, sp, -16
; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -1151,12 +1151,12 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or t6, s0, t6
; RV32I-NEXT: beqz t6, .LBB17_6
; RV32I-NEXT: # %bb.5:
-; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: mv t4, t3
; RV32I-NEXT: .LBB17_6:
-; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: mv t3, t2
; RV32I-NEXT: beq a2, a4, .LBB17_8
; RV32I-NEXT: # %bb.7:
-; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: mv t3, t5
; RV32I-NEXT: .LBB17_8:
; RV32I-NEXT: sltu t5, a3, a1
; RV32I-NEXT: mv t6, t5
@@ -1164,17 +1164,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: # %bb.9:
; RV32I-NEXT: sltu t6, a4, a2
; RV32I-NEXT: .LBB17_10:
-; RV32I-NEXT: bnez t3, .LBB17_12
+; RV32I-NEXT: bnez t4, .LBB17_12
; RV32I-NEXT: # %bb.11:
; RV32I-NEXT: sub a7, t1, a7
; RV32I-NEXT: sub a5, a6, a5
; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a2, a2, a4
; RV32I-NEXT: sub a4, a7, t0
-; RV32I-NEXT: sltu a6, a5, t4
+; RV32I-NEXT: sltu a6, a5, t3
; RV32I-NEXT: sub a3, a2, t2
; RV32I-NEXT: sub a2, a4, a6
-; RV32I-NEXT: sub a4, a5, t4
+; RV32I-NEXT: sub a4, a5, t3
; RV32I-NEXT: j .LBB17_13
; RV32I-NEXT: .LBB17_12:
; RV32I-NEXT: sltu t0, a5, a6
@@ -1228,17 +1228,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: lw t1, 12(a2)
; RV32ZBB-NEXT: lw a2, 4(a2)
; RV32ZBB-NEXT: sltu t0, a6, a5
-; RV32ZBB-NEXT: mv t4, t0
+; RV32ZBB-NEXT: mv t3, t0
; RV32ZBB-NEXT: beq a7, t1, .LBB17_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: slt t4, t1, a7
+; RV32ZBB-NEXT: slt t3, t1, a7
; RV32ZBB-NEXT: .LBB17_2:
; RV32ZBB-NEXT: sltu t2, a1, a3
; RV32ZBB-NEXT: sltu t5, a2, a4
-; RV32ZBB-NEXT: mv t3, t2
+; RV32ZBB-NEXT: mv t4, t2
; RV32ZBB-NEXT: beq a4, a2, .LBB17_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: mv t3, t5
+; RV32ZBB-NEXT: mv t4, t5
; RV32ZBB-NEXT: .LBB17_4:
; RV32ZBB-NEXT: addi sp, sp, -16
; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -1247,12 +1247,12 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: or t6, s0, t6
; RV32ZBB-NEXT: beqz t6, .LBB17_6
; RV32ZBB-NEXT: # %bb.5:
-; RV32ZBB-NEXT: mv t3, t4
+; RV32ZBB-NEXT: mv t4, t3
; RV32ZBB-NEXT: .LBB17_6:
-; RV32ZBB-NEXT: mv t4, t2
+; RV32ZBB-NEXT: mv t3, t2
; RV32ZBB-NEXT: beq a2, a4, .LBB17_8
; RV32ZBB-NEXT: # %bb.7:
-; RV32ZBB-NEXT: mv t4, t5
+; RV32ZBB-NEXT: mv t3, t5
; RV32ZBB-NEXT: .LBB17_8:
; RV32ZBB-NEXT: sltu t5, a3, a1
; RV32ZBB-NEXT: mv t6, t5
@@ -1260,17 +1260,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: # %bb.9:
; RV32ZBB-NEXT: sltu t6, a4, a2
; RV32ZBB-NEXT: .LBB17_10:
-; RV32ZBB-NEXT: bnez t3, .LBB17_12
+; RV32ZBB-NEXT: bnez t4, .LBB17_12
; RV32ZBB-NEXT: # %bb.11:
; RV32ZBB-NEXT: sub a7, t1, a7
; RV32ZBB-NEXT: sub a5, a6, a5
; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: sub a2, a2, a4
; RV32ZBB-NEXT: sub a4, a7, t0
-; RV32ZBB-NEXT: sltu a6, a5, t4
+; RV32ZBB-NEXT: sltu a6, a5, t3
; RV32ZBB-NEXT: sub a3, a2, t2
; RV32ZBB-NEXT: sub a2, a4, a6
-; RV32ZBB-NEXT: sub a4, a5, t4
+; RV32ZBB-NEXT: sub a4, a5, t3
; RV32ZBB-NEXT: j .LBB17_13
; RV32ZBB-NEXT: .LBB17_12:
; RV32ZBB-NEXT: sltu t0, a5, a6
@@ -1523,17 +1523,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw t1, 12(a2)
; RV32I-NEXT: lw a2, 4(a2)
; RV32I-NEXT: sltu t0, a6, a5
-; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: mv t3, t0
; RV32I-NEXT: beq a7, t1, .LBB22_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: slt t4, t1, a7
+; RV32I-NEXT: slt t3, t1, a7
; RV32I-NEXT: .LBB22_2:
; RV32I-NEXT: sltu t2, a1, a3
; RV32I-NEXT: sltu t5, a2, a4
-; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: mv t4, t2
; RV32I-NEXT: beq a4, a2, .LBB22_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: mv t4, t5
; RV32I-NEXT: .LBB22_4:
; RV32I-NEXT: addi sp, sp, -16
; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -1542,12 +1542,12 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or t6, s0, t6
; RV32I-NEXT: beqz t6, .LBB22_6
; RV32I-NEXT: # %bb.5:
-; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: mv t4, t3
; RV32I-NEXT: .LBB22_6:
-; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: mv t3, t2
; RV32I-NEXT: beq a2, a4, .LBB22_8
; RV32I-NEXT: # %bb.7:
-; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: mv t3, t5
; RV32I-NEXT: .LBB22_8:
; RV32I-NEXT: sltu t5, a3, a1
; RV32I-NEXT: mv t6, t5
@@ -1555,17 +1555,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: # %bb.9:
; RV32I-NEXT: sltu t6, a4, a2
; RV32I-NEXT: .LBB22_10:
-; RV32I-NEXT: bnez t3, .LBB22_12
+; RV32I-NEXT: bnez t4, .LBB22_12
; RV32I-NEXT: # %bb.11:
; RV32I-NEXT: sub a7, t1, a7
; RV32I-NEXT: sub a5, a6, a5
; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a2, a2, a4
; RV32I-NEXT: sub a4, a7, t0
-; RV32I-NEXT: sltu a6, a5, t4
+; RV32I-NEXT: sltu a6, a5, t3
; RV32I-NEXT: sub a3, a2, t2
; RV32I-NEXT: sub a2, a4, a6
-; RV32I-NEXT: sub a4, a5, t4
+; RV32I-NEXT: sub a4, a5, t3
; RV32I-NEXT: j .LBB22_13
; RV32I-NEXT: .LBB22_12:
; RV32I-NEXT: sltu t0, a5, a6
@@ -1619,17 +1619,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: lw t1, 12(a2)
; RV32ZBB-NEXT: lw a2, 4(a2)
; RV32ZBB-NEXT: sltu t0, a6, a5
-; RV32ZBB-NEXT: mv t4, t0
+; RV32ZBB-NEXT: mv t3, t0
; RV32ZBB-NEXT: beq a7, t1, .LBB22_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: slt t4, t1, a7
+; RV32ZBB-NEXT: slt t3, t1, a7
; RV32ZBB-NEXT: .LBB22_2:
; RV32ZBB-NEXT: sltu t2, a1, a3
; RV32ZBB-NEXT: sltu t5, a2, a4
-; RV32ZBB-NEXT: mv t3, t2
+; RV32ZBB-NEXT: mv t4, t2
; RV32ZBB-NEXT: beq a4, a2, .LBB22_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: mv t3, t5
+; RV32ZBB-NEXT: mv t4, t5
; RV32ZBB-NEXT: .LBB22_4:
; RV32ZBB-NEXT: addi sp, sp, -16
; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -1638,12 +1638,12 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: or t6, s0, t6
; RV32ZBB-NEXT: beqz t6, .LBB22_6
; RV32ZBB-NEXT: # %bb.5:
-; RV32ZBB-NEXT: mv t3, t4
+; RV32ZBB-NEXT: mv t4, t3
; RV32ZBB-NEXT: .LBB22_6:
-; RV32ZBB-NEXT: mv t4, t2
+; RV32ZBB-NEXT: mv t3, t2
; RV32ZBB-NEXT: beq a2, a4, .LBB22_8
; RV32ZBB-NEXT: # %bb.7:
-; RV32ZBB-NEXT: mv t4, t5
+; RV32ZBB-NEXT: mv t3, t5
; RV32ZBB-NEXT: .LBB22_8:
; RV32ZBB-NEXT: sltu t5, a3, a1
; RV32ZBB-NEXT: mv t6, t5
@@ -1651,17 +1651,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: # %bb.9:
; RV32ZBB-NEXT: sltu t6, a4, a2
; RV32ZBB-NEXT: .LBB22_10:
-; RV32ZBB-NEXT: bnez t3, .LBB22_12
+; RV32ZBB-NEXT: bnez t4, .LBB22_12
; RV32ZBB-NEXT: # %bb.11:
; RV32ZBB-NEXT: sub a7, t1, a7
; RV32ZBB-NEXT: sub a5, a6, a5
; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: sub a2, a2, a4
; RV32ZBB-NEXT: sub a4, a7, t0
-; RV32ZBB-NEXT: sltu a6, a5, t4
+; RV32ZBB-NEXT: sltu a6, a5, t3
; RV32ZBB-NEXT: sub a3, a2, t2
; RV32ZBB-NEXT: sub a2, a4, a6
-; RV32ZBB-NEXT: sub a4, a5, t4
+; RV32ZBB-NEXT: sub a4, a5, t3
; RV32ZBB-NEXT: j .LBB22_13
; RV32ZBB-NEXT: .LBB22_12:
; RV32ZBB-NEXT: sltu t0, a5, a6
@@ -2546,17 +2546,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw t1, 12(a2)
; RV32I-NEXT: lw a2, 4(a2)
; RV32I-NEXT: sltu t0, a6, a5
-; RV32I-NEXT: mv t4, t0
+; RV32I-NEXT: mv t3, t0
; RV32I-NEXT: beq a7, t1, .LBB38_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: slt t4, t1, a7
+; RV32I-NEXT: slt t3, t1, a7
; RV32I-NEXT: .LBB38_2:
; RV32I-NEXT: sltu t2, a1, a3
; RV32I-NEXT: sltu t5, a2, a4
-; RV32I-NEXT: mv t3, t2
+; RV32I-NEXT: mv t4, t2
; RV32I-NEXT: beq a4, a2, .LBB38_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: mv t3, t5
+; RV32I-NEXT: mv t4, t5
; RV32I-NEXT: .LBB38_4:
; RV32I-NEXT: addi sp, sp, -16
; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -2565,12 +2565,12 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: or t6, s0, t6
; RV32I-NEXT: beqz t6, .LBB38_6
; RV32I-NEXT: # %bb.5:
-; RV32I-NEXT: mv t3, t4
+; RV32I-NEXT: mv t4, t3
; RV32I-NEXT: .LBB38_6:
-; RV32I-NEXT: mv t4, t2
+; RV32I-NEXT: mv t3, t2
; RV32I-NEXT: beq a2, a4, .LBB38_8
; RV32I-NEXT: # %bb.7:
-; RV32I-NEXT: mv t4, t5
+; RV32I-NEXT: mv t3, t5
; RV32I-NEXT: .LBB38_8:
; RV32I-NEXT: sltu t5, a3, a1
; RV32I-NEXT: mv t6, t5
@@ -2578,17 +2578,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: # %bb.9:
; RV32I-NEXT: sltu t6, a4, a2
; RV32I-NEXT: .LBB38_10:
-; RV32I-NEXT: bnez t3, .LBB38_12
+; RV32I-NEXT: bnez t4, .LBB38_12
; RV32I-NEXT: # %bb.11:
; RV32I-NEXT: sub a7, t1, a7
; RV32I-NEXT: sub a5, a6, a5
; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: sub a2, a2, a4
; RV32I-NEXT: sub a4, a7, t0
-; RV32I-NEXT: sltu a6, a5, t4
+; RV32I-NEXT: sltu a6, a5, t3
; RV32I-NEXT: sub a3, a2, t2
; RV32I-NEXT: sub a2, a4, a6
-; RV32I-NEXT: sub a4, a5, t4
+; RV32I-NEXT: sub a4, a5, t3
; RV32I-NEXT: j .LBB38_13
; RV32I-NEXT: .LBB38_12:
; RV32I-NEXT: sltu t0, a5, a6
@@ -2642,17 +2642,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: lw t1, 12(a2)
; RV32ZBB-NEXT: lw a2, 4(a2)
; RV32ZBB-NEXT: sltu t0, a6, a5
-; RV32ZBB-NEXT: mv t4, t0
+; RV32ZBB-NEXT: mv t3, t0
; RV32ZBB-NEXT: beq a7, t1, .LBB38_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: slt t4, t1, a7
+; RV32ZBB-NEXT: slt t3, t1, a7
; RV32ZBB-NEXT: .LBB38_2:
; RV32ZBB-NEXT: sltu t2, a1, a3
; RV32ZBB-NEXT: sltu t5, a2, a4
-; RV32ZBB-NEXT: mv t3, t2
+; RV32ZBB-NEXT: mv t4, t2
; RV32ZBB-NEXT: beq a4, a2, .LBB38_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: mv t3, t5
+; RV32ZBB-NEXT: mv t4, t5
; RV32ZBB-NEXT: .LBB38_4:
; RV32ZBB-NEXT: addi sp, sp, -16
; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
@@ -2661,12 +2661,12 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: or t6, s0, t6
; RV32ZBB-NEXT: beqz t6, .LBB38_6
; RV32ZBB-NEXT: # %bb.5:
-; RV32ZBB-NEXT: mv t3, t4
+; RV32ZBB-NEXT: mv t4, t3
; RV32ZBB-NEXT: .LBB38_6:
-; RV32ZBB-NEXT: mv t4, t2
+; RV32ZBB-NEXT: mv t3, t2
; RV32ZBB-NEXT: beq a2, a4, .LBB38_8
; RV32ZBB-NEXT: # %bb.7:
-; RV32ZBB-NEXT: mv t4, t5
+; RV32ZBB-NEXT: mv t3, t5
; RV32ZBB-NEXT: .LBB38_8:
; RV32ZBB-NEXT: sltu t5, a3, a1
; RV32ZBB-NEXT: mv t6, t5
@@ -2674,17 +2674,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
; RV32ZBB-NEXT: # %bb.9:
; RV32ZBB-NEXT: sltu t6, a4, a2
; RV32ZBB-NEXT: .LBB38_10:
-; RV32ZBB-NEXT: bnez t3, .LBB38_12
+; RV32ZBB-NEXT: bnez t4, .LBB38_12
; RV32ZBB-NEXT: # %bb.11:
; RV32ZBB-NEXT: sub a7, t1, a7
; RV32ZBB-NEXT: sub a5, a6, a5
; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: sub a2, a2, a4
; RV32ZBB-NEXT: sub a4, a7, t0
-; RV32ZBB-NEXT: sltu a6, a5, t4
+; RV32ZBB-NEXT: sltu a6, a5, t3
; RV32ZBB-NEXT: sub a3, a2, t2
; RV32ZBB-NEXT: sub a2, a4, a6
-; RV32ZBB-NEXT: sub a4, a5, t4
+; RV32ZBB-NEXT: sub a4, a5, t3
; RV32ZBB-NEXT: j .LBB38_13
; RV32ZBB-NEXT: .LBB38_12:
; RV32ZBB-NEXT: sltu t0, a5, a6
diff --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
index d250098576687..bf81d03ec1352 100644
--- a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
@@ -52,18 +52,18 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn
;
; CHECK-PIPELINED-LABEL: test_pipelined_1:
; CHECK-PIPELINED: # %bb.0: # %entry
-; CHECK-PIPELINED-NEXT: blez a2, .LBB1_6
+; CHECK-PIPELINED-NEXT: blez a2, .LBB1_7
; CHECK-PIPELINED-NEXT: # %bb.1: # %for.body.preheader
; CHECK-PIPELINED-NEXT: lw a4, 0(a1)
; CHECK-PIPELINED-NEXT: addi a2, a2, -1
+; CHECK-PIPELINED-NEXT: addi a3, a0, 4
; CHECK-PIPELINED-NEXT: sh2add.uw a6, a2, a1
-; CHECK-PIPELINED-NEXT: addi a2, a0, 4
; CHECK-PIPELINED-NEXT: addi a1, a1, 4
; CHECK-PIPELINED-NEXT: addi a6, a6, 4
; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_5
; CHECK-PIPELINED-NEXT: # %bb.2: # %for.body
; CHECK-PIPELINED-NEXT: lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT: addi a3, a2, 4
+; CHECK-PIPELINED-NEXT: addi a2, a3, 4
; CHECK-PIPELINED-NEXT: addi a4, a4, 1
; CHECK-PIPELINED-NEXT: addi a1, a1, 4
; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_4
@@ -72,20 +72,22 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn
; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
; CHECK-PIPELINED-NEXT: mv a4, a5
; CHECK-PIPELINED-NEXT: lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT: mv a0, a2
-; CHECK-PIPELINED-NEXT: mv a2, a3
-; CHECK-PIPELINED-NEXT: addi a3, a3, 4
+; CHECK-PIPELINED-NEXT: mv a0, a3
+; CHECK-PIPELINED-NEXT: mv a3, a2
+; CHECK-PIPELINED-NEXT: addi a2, a2, 4
; CHECK-PIPELINED-NEXT: addi a4, a4, 1
; CHECK-PIPELINED-NEXT: addi a1, a1, 4
; CHECK-PIPELINED-NEXT: bne a1, a6, .LBB1_3
; CHECK-PIPELINED-NEXT: .LBB1_4:
; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT: mv a0, a2
-; CHECK-PIPELINED-NEXT: mv a4, a5
+; CHECK-PIPELINED-NEXT: j .LBB1_6
; CHECK-PIPELINED-NEXT: .LBB1_5:
-; CHECK-PIPELINED-NEXT: addi a4, a4, 1
-; CHECK-PIPELINED-NEXT: sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT: .LBB1_6: # %for.end
+; CHECK-PIPELINED-NEXT: mv a3, a0
+; CHECK-PIPELINED-NEXT: mv a5, a4
+; CHECK-PIPELINED-NEXT: .LBB1_6:
+; CHECK-PIPELINED-NEXT: addi a5, a5, 1
+; CHECK-PIPELINED-NEXT: sw a5, 0(a3)
+; CHECK-PIPELINED-NEXT: .LBB1_7: # %for.end
; CHECK-PIPELINED-NEXT: ret
entry:
%cmp = icmp sgt i32 %cnt, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 533b8b6864ebc..7fbe60bc9cb34 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -6571,53 +6571,53 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: lw a4, 32(a2)
; RV32ZVE32F-NEXT: lw a5, 40(a2)
-; RV32ZVE32F-NEXT: lw a6, 48(a2)
-; RV32ZVE32F-NEXT: lw a7, 56(a2)
-; RV32ZVE32F-NEXT: lw t0, 0(a2)
+; RV32ZVE32F-NEXT: lw a7, 48(a2)
+; RV32ZVE32F-NEXT: lw t0, 56(a2)
+; RV32ZVE32F-NEXT: lw a6, 0(a2)
; RV32ZVE32F-NEXT: lw t1, 8(a2)
; RV32ZVE32F-NEXT: lw t2, 16(a2)
; RV32ZVE32F-NEXT: lw a2, 24(a2)
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vmv.v.x v8, t0
+; RV32ZVE32F-NEXT: vmv.v.x v8, a6
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s t0, v0
+; RV32ZVE32F-NEXT: vmv.x.s a6, v0
; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5
-; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6
; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7
+; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0
; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT: andi a2, t0, 1
+; RV32ZVE32F-NEXT: andi a2, a6, 1
; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
; RV32ZVE32F-NEXT: beqz a2, .LBB57_7
; RV32ZVE32F-NEXT: # %bb.1: # %cond.load
; RV32ZVE32F-NEXT: vmv.x.s a2, v8
; RV32ZVE32F-NEXT: lw a1, 0(a2)
; RV32ZVE32F-NEXT: lw a2, 4(a2)
-; RV32ZVE32F-NEXT: andi a4, t0, 2
+; RV32ZVE32F-NEXT: andi a4, a6, 2
; RV32ZVE32F-NEXT: bnez a4, .LBB57_8
; RV32ZVE32F-NEXT: .LBB57_2:
; RV32ZVE32F-NEXT: lw a4, 8(a3)
; RV32ZVE32F-NEXT: lw a5, 12(a3)
-; RV32ZVE32F-NEXT: andi a6, t0, 4
-; RV32ZVE32F-NEXT: bnez a6, .LBB57_9
+; RV32ZVE32F-NEXT: andi a7, a6, 4
+; RV32ZVE32F-NEXT: bnez a7, .LBB57_9
; RV32ZVE32F-NEXT: .LBB57_3:
-; RV32ZVE32F-NEXT: lw a6, 16(a3)
-; RV32ZVE32F-NEXT: lw a7, 20(a3)
-; RV32ZVE32F-NEXT: andi t1, t0, 8
+; RV32ZVE32F-NEXT: lw a7, 16(a3)
+; RV32ZVE32F-NEXT: lw t0, 20(a3)
+; RV32ZVE32F-NEXT: andi t1, a6, 8
; RV32ZVE32F-NEXT: bnez t1, .LBB57_10
; RV32ZVE32F-NEXT: .LBB57_4:
; RV32ZVE32F-NEXT: lw t1, 24(a3)
; RV32ZVE32F-NEXT: lw t2, 28(a3)
-; RV32ZVE32F-NEXT: andi t3, t0, 16
+; RV32ZVE32F-NEXT: andi t3, a6, 16
; RV32ZVE32F-NEXT: bnez t3, .LBB57_11
; RV32ZVE32F-NEXT: .LBB57_5:
; RV32ZVE32F-NEXT: lw t3, 32(a3)
; RV32ZVE32F-NEXT: lw t4, 36(a3)
-; RV32ZVE32F-NEXT: andi t5, t0, 32
+; RV32ZVE32F-NEXT: andi t5, a6, 32
; RV32ZVE32F-NEXT: bnez t5, .LBB57_12
; RV32ZVE32F-NEXT: .LBB57_6:
; RV32ZVE32F-NEXT: lw t5, 40(a3)
@@ -6626,7 +6626,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV32ZVE32F-NEXT: .LBB57_7:
; RV32ZVE32F-NEXT: lw a1, 0(a3)
; RV32ZVE32F-NEXT: lw a2, 4(a3)
-; RV32ZVE32F-NEXT: andi a4, t0, 2
+; RV32ZVE32F-NEXT: andi a4, a6, 2
; RV32ZVE32F-NEXT: beqz a4, .LBB57_2
; RV32ZVE32F-NEXT: .LBB57_8: # %cond.load1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
@@ -6634,15 +6634,15 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV32ZVE32F-NEXT: vmv.x.s a5, v10
; RV32ZVE32F-NEXT: lw a4, 0(a5)
; RV32ZVE32F-NEXT: lw a5, 4(a5)
-; RV32ZVE32F-NEXT: andi a6, t0, 4
-; RV32ZVE32F-NEXT: beqz a6, .LBB57_3
+; RV32ZVE32F-NEXT: andi a7, a6, 4
+; RV32ZVE32F-NEXT: beqz a7, .LBB57_3
; RV32ZVE32F-NEXT: .LBB57_9: # %cond.load4
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a7, v10
-; RV32ZVE32F-NEXT: lw a6, 0(a7)
-; RV32ZVE32F-NEXT: lw a7, 4(a7)
-; RV32ZVE32F-NEXT: andi t1, t0, 8
+; RV32ZVE32F-NEXT: vmv.x.s t0, v10
+; RV32ZVE32F-NEXT: lw a7, 0(t0)
+; RV32ZVE32F-NEXT: lw t0, 4(t0)
+; RV32ZVE32F-NEXT: andi t1, a6, 8
; RV32ZVE32F-NEXT: beqz t1, .LBB57_4
; RV32ZVE32F-NEXT: .LBB57_10: # %cond.load7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
@@ -6650,7 +6650,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV32ZVE32F-NEXT: vmv.x.s t2, v10
; RV32ZVE32F-NEXT: lw t1, 0(t2)
; RV32ZVE32F-NEXT: lw t2, 4(t2)
-; RV32ZVE32F-NEXT: andi t3, t0, 16
+; RV32ZVE32F-NEXT: andi t3, a6, 16
; RV32ZVE32F-NEXT: beqz t3, .LBB57_5
; RV32ZVE32F-NEXT: .LBB57_11: # %cond.load10
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
@@ -6658,7 +6658,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV32ZVE32F-NEXT: vmv.x.s t4, v10
; RV32ZVE32F-NEXT: lw t3, 0(t4)
; RV32ZVE32F-NEXT: lw t4, 4(t4)
-; RV32ZVE32F-NEXT: andi t5, t0, 32
+; RV32ZVE32F-NEXT: andi t5, a6, 32
; RV32ZVE32F-NEXT: beqz t5, .LBB57_6
; RV32ZVE32F-NEXT: .LBB57_12: # %cond.load13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
@@ -6673,7 +6673,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32ZVE32F-NEXT: .cfi_offset s0, -4
; RV32ZVE32F-NEXT: .cfi_offset s1, -8
-; RV32ZVE32F-NEXT: andi s0, t0, 64
+; RV32ZVE32F-NEXT: andi s0, a6, 64
; RV32ZVE32F-NEXT: beqz s0, .LBB57_16
; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
@@ -6681,30 +6681,30 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV32ZVE32F-NEXT: vmv.x.s s1, v10
; RV32ZVE32F-NEXT: lw s0, 0(s1)
; RV32ZVE32F-NEXT: lw s1, 4(s1)
-; RV32ZVE32F-NEXT: andi t0, t0, -128
-; RV32ZVE32F-NEXT: bnez t0, .LBB57_17
+; RV32ZVE32F-NEXT: andi a6, a6, -128
+; RV32ZVE32F-NEXT: bnez a6, .LBB57_17
; RV32ZVE32F-NEXT: .LBB57_15:
-; RV32ZVE32F-NEXT: lw t0, 56(a3)
+; RV32ZVE32F-NEXT: lw a6, 56(a3)
; RV32ZVE32F-NEXT: lw a3, 60(a3)
; RV32ZVE32F-NEXT: j .LBB57_18
; RV32ZVE32F-NEXT: .LBB57_16:
; RV32ZVE32F-NEXT: lw s0, 48(a3)
; RV32ZVE32F-NEXT: lw s1, 52(a3)
-; RV32ZVE32F-NEXT: andi t0, t0, -128
-; RV32ZVE32F-NEXT: beqz t0, .LBB57_15
+; RV32ZVE32F-NEXT: andi a6, a6, -128
+; RV32ZVE32F-NEXT: beqz a6, .LBB57_15
; RV32ZVE32F-NEXT: .LBB57_17: # %cond.load19
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
; RV32ZVE32F-NEXT: vmv.x.s a3, v8
-; RV32ZVE32F-NEXT: lw t0, 0(a3)
+; RV32ZVE32F-NEXT: lw a6, 0(a3)
; RV32ZVE32F-NEXT: lw a3, 4(a3)
; RV32ZVE32F-NEXT: .LBB57_18: # %else20
; RV32ZVE32F-NEXT: sw a1, 0(a0)
; RV32ZVE32F-NEXT: sw a2, 4(a0)
; RV32ZVE32F-NEXT: sw a4, 8(a0)
; RV32ZVE32F-NEXT: sw a5, 12(a0)
-; RV32ZVE32F-NEXT: sw a6, 16(a0)
-; RV32ZVE32F-NEXT: sw a7, 20(a0)
+; RV32ZVE32F-NEXT: sw a7, 16(a0)
+; RV32ZVE32F-NEXT: sw t0, 20(a0)
; RV32ZVE32F-NEXT: sw t1, 24(a0)
; RV32ZVE32F-NEXT: sw t2, 28(a0)
; RV32ZVE32F-NEXT: sw t3, 32(a0)
@@ -6713,7 +6713,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV32ZVE32F-NEXT: sw t6, 44(a0)
; RV32ZVE32F-NEXT: sw s0, 48(a0)
; RV32ZVE32F-NEXT: sw s1, 52(a0)
-; RV32ZVE32F-NEXT: sw t0, 56(a0)
+; RV32ZVE32F-NEXT: sw a6, 56(a0)
; RV32ZVE32F-NEXT: sw a3, 60(a0)
; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
@@ -6726,89 +6726,89 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV64ZVE32F-LABEL: mgather_baseidx_v8i64:
; RV64ZVE32F: # %bb.0:
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a7, v0
-; RV64ZVE32F-NEXT: andi a4, a7, 1
+; RV64ZVE32F-NEXT: vmv.x.s a6, v0
+; RV64ZVE32F-NEXT: andi a4, a6, 1
; RV64ZVE32F-NEXT: beqz a4, .LBB57_9
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: ld a4, 0(a2)
; RV64ZVE32F-NEXT: slli a4, a4, 3
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: ld a4, 0(a4)
-; RV64ZVE32F-NEXT: andi a5, a7, 2
+; RV64ZVE32F-NEXT: andi a5, a6, 2
; RV64ZVE32F-NEXT: bnez a5, .LBB57_10
; RV64ZVE32F-NEXT: .LBB57_2:
; RV64ZVE32F-NEXT: ld a5, 8(a3)
-; RV64ZVE32F-NEXT: andi a6, a7, 4
-; RV64ZVE32F-NEXT: bnez a6, .LBB57_11
+; RV64ZVE32F-NEXT: andi a7, a6, 4
+; RV64ZVE32F-NEXT: bnez a7, .LBB57_11
; RV64ZVE32F-NEXT: .LBB57_3:
-; RV64ZVE32F-NEXT: ld a6, 16(a3)
-; RV64ZVE32F-NEXT: andi t0, a7, 8
+; RV64ZVE32F-NEXT: ld a7, 16(a3)
+; RV64ZVE32F-NEXT: andi t0, a6, 8
; RV64ZVE32F-NEXT: bnez t0, .LBB57_12
; RV64ZVE32F-NEXT: .LBB57_4:
; RV64ZVE32F-NEXT: ld t0, 24(a3)
-; RV64ZVE32F-NEXT: andi t1, a7, 16
+; RV64ZVE32F-NEXT: andi t1, a6, 16
; RV64ZVE32F-NEXT: bnez t1, .LBB57_13
; RV64ZVE32F-NEXT: .LBB57_5:
; RV64ZVE32F-NEXT: ld t1, 32(a3)
-; RV64ZVE32F-NEXT: andi t2, a7, 32
+; RV64ZVE32F-NEXT: andi t2, a6, 32
; RV64ZVE32F-NEXT: bnez t2, .LBB57_14
; RV64ZVE32F-NEXT: .LBB57_6:
; RV64ZVE32F-NEXT: ld t2, 40(a3)
-; RV64ZVE32F-NEXT: andi t3, a7, 64
+; RV64ZVE32F-NEXT: andi t3, a6, 64
; RV64ZVE32F-NEXT: bnez t3, .LBB57_15
; RV64ZVE32F-NEXT: .LBB57_7:
; RV64ZVE32F-NEXT: ld t3, 48(a3)
-; RV64ZVE32F-NEXT: andi a7, a7, -128
-; RV64ZVE32F-NEXT: bnez a7, .LBB57_16
+; RV64ZVE32F-NEXT: andi a6, a6, -128
+; RV64ZVE32F-NEXT: bnez a6, .LBB57_16
; RV64ZVE32F-NEXT: .LBB57_8:
; RV64ZVE32F-NEXT: ld a1, 56(a3)
; RV64ZVE32F-NEXT: j .LBB57_17
; RV64ZVE32F-NEXT: .LBB57_9:
; RV64ZVE32F-NEXT: ld a4, 0(a3)
-; RV64ZVE32F-NEXT: andi a5, a7, 2
+; RV64ZVE32F-NEXT: andi a5, a6, 2
; RV64ZVE32F-NEXT: beqz a5, .LBB57_2
; RV64ZVE32F-NEXT: .LBB57_10: # %cond.load1
; RV64ZVE32F-NEXT: ld a5, 8(a2)
; RV64ZVE32F-NEXT: slli a5, a5, 3
; RV64ZVE32F-NEXT: add a5, a1, a5
; RV64ZVE32F-NEXT: ld a5, 0(a5)
-; RV64ZVE32F-NEXT: andi a6, a7, 4
-; RV64ZVE32F-NEXT: beqz a6, .LBB57_3
+; RV64ZVE32F-NEXT: andi a7, a6, 4
+; RV64ZVE32F-NEXT: beqz a7, .LBB57_3
; RV64ZVE32F-NEXT: .LBB57_11: # %cond.load4
-; RV64ZVE32F-NEXT: ld a6, 16(a2)
-; RV64ZVE32F-NEXT: slli a6, a6, 3
-; RV64ZVE32F-NEXT: add a6, a1, a6
-; RV64ZVE32F-NEXT: ld a6, 0(a6)
-; RV64ZVE32F-NEXT: andi t0, a7, 8
+; RV64ZVE32F-NEXT: ld a7, 16(a2)
+; RV64ZVE32F-NEXT: slli a7, a7, 3
+; RV64ZVE32F-NEXT: add a7, a1, a7
+; RV64ZVE32F-NEXT: ld a7, 0(a7)
+; RV64ZVE32F-NEXT: andi t0, a6, 8
; RV64ZVE32F-NEXT: beqz t0, .LBB57_4
; RV64ZVE32F-NEXT: .LBB57_12: # %cond.load7
; RV64ZVE32F-NEXT: ld t0, 24(a2)
; RV64ZVE32F-NEXT: slli t0, t0, 3
; RV64ZVE32F-NEXT: add t0, a1, t0
; RV64ZVE32F-NEXT: ld t0, 0(t0)
-; RV64ZVE32F-NEXT: andi t1, a7, 16
+; RV64ZVE32F-NEXT: andi t1, a6, 16
; RV64ZVE32F-NEXT: beqz t1, .LBB57_5
; RV64ZVE32F-NEXT: .LBB57_13: # %cond.load10
; RV64ZVE32F-NEXT: ld t1, 32(a2)
; RV64ZVE32F-NEXT: slli t1, t1, 3
; RV64ZVE32F-NEXT: add t1, a1, t1
; RV64ZVE32F-NEXT: ld t1, 0(t1)
-; RV64ZVE32F-NEXT: andi t2, a7, 32
+; RV64ZVE32F-NEXT: andi t2, a6, 32
; RV64ZVE32F-NEXT: beqz t2, .LBB57_6
; RV64ZVE32F-NEXT: .LBB57_14: # %cond.load13
; RV64ZVE32F-NEXT: ld t2, 40(a2)
; RV64ZVE32F-NEXT: slli t2, t2, 3
; RV64ZVE32F-NEXT: add t2, a1, t2
; RV64ZVE32F-NEXT: ld t2, 0(t2)
-; RV64ZVE32F-NEXT: andi t3, a7, 64
+; RV64ZVE32F-NEXT: andi t3, a6, 64
; RV64ZVE32F-NEXT: beqz t3, .LBB57_7
; RV64ZVE32F-NEXT: .LBB57_15: # %cond.load16
; RV64ZVE32F-NEXT: ld t3, 48(a2)
; RV64ZVE32F-NEXT: slli t3, t3, 3
; RV64ZVE32F-NEXT: add t3, a1, t3
; RV64ZVE32F-NEXT: ld t3, 0(t3)
-; RV64ZVE32F-NEXT: andi a7, a7, -128
-; RV64ZVE32F-NEXT: beqz a7, .LBB57_8
+; RV64ZVE32F-NEXT: andi a6, a6, -128
+; RV64ZVE32F-NEXT: beqz a6, .LBB57_8
; RV64ZVE32F-NEXT: .LBB57_16: # %cond.load19
; RV64ZVE32F-NEXT: ld a2, 56(a2)
; RV64ZVE32F-NEXT: slli a2, a2, 3
@@ -6817,7 +6817,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
; RV64ZVE32F-NEXT: .LBB57_17: # %else20
; RV64ZVE32F-NEXT: sd a4, 0(a0)
; RV64ZVE32F-NEXT: sd a5, 8(a0)
-; RV64ZVE32F-NEXT: sd a6, 16(a0)
+; RV64ZVE32F-NEXT: sd a7, 16(a0)
; RV64ZVE32F-NEXT: sd t0, 24(a0)
; RV64ZVE32F-NEXT: sd t1, 32(a0)
; RV64ZVE32F-NEXT: sd t2, 40(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 07aa05f609c40..55729a680c294 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -897,55 +897,54 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
; CHECK-NEXT: beq a2, a3, .LBB14_7
; CHECK-NEXT: # %bb.1: # %bb3
; CHECK-NEXT: li a3, 1023
-; CHECK-NEXT: subw a5, a3, a2
-; CHECK-NEXT: li a6, 31
-; CHECK-NEXT: mv a4, a2
-; CHECK-NEXT: bltu a5, a6, .LBB14_5
+; CHECK-NEXT: subw a4, a3, a2
+; CHECK-NEXT: li a5, 31
+; CHECK-NEXT: bltu a4, a5, .LBB14_5
; CHECK-NEXT: # %bb.2: # %bb9
-; CHECK-NEXT: slli a4, a5, 32
-; CHECK-NEXT: slli t0, a2, 2
-; CHECK-NEXT: add a5, a0, a2
+; CHECK-NEXT: slli a5, a4, 32
+; CHECK-NEXT: slli a7, a2, 2
+; CHECK-NEXT: add a4, a0, a2
; CHECK-NEXT: add a6, a1, a2
; CHECK-NEXT: li t2, 32
-; CHECK-NEXT: srli a4, a4, 32
-; CHECK-NEXT: add t0, a6, t0
-; CHECK-NEXT: addi a6, a4, 1
-; CHECK-NEXT: andi a7, a6, -32
-; CHECK-NEXT: add a4, a7, a2
-; CHECK-NEXT: add a2, a4, a0
+; CHECK-NEXT: srli a5, a5, 32
+; CHECK-NEXT: add a7, a6, a7
+; CHECK-NEXT: addi a5, a5, 1
+; CHECK-NEXT: andi a6, a5, -32
+; CHECK-NEXT: add a2, a6, a2
+; CHECK-NEXT: add t0, a2, a0
; CHECK-NEXT: li t1, 5
; CHECK-NEXT: vsetvli zero, t2, e8, m1, ta, ma
; CHECK-NEXT: .LBB14_3: # %bb15
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vlse8.v v8, (t0), t1
-; CHECK-NEXT: vle8.v v9, (a5)
+; CHECK-NEXT: vlse8.v v8, (a7), t1
+; CHECK-NEXT: vle8.v v9, (a4)
; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vse8.v v8, (a5)
-; CHECK-NEXT: addi a5, a5, 32
-; CHECK-NEXT: addi t0, t0, 160
-; CHECK-NEXT: bne a5, a2, .LBB14_3
+; CHECK-NEXT: vse8.v v8, (a4)
+; CHECK-NEXT: addi a4, a4, 32
+; CHECK-NEXT: addi a7, a7, 160
+; CHECK-NEXT: bne a4, t0, .LBB14_3
; CHECK-NEXT: # %bb.4: # %bb30
-; CHECK-NEXT: beq a6, a7, .LBB14_7
+; CHECK-NEXT: beq a5, a6, .LBB14_7
; CHECK-NEXT: .LBB14_5: # %bb32
-; CHECK-NEXT: add a2, a0, a4
-; CHECK-NEXT: slli a5, a4, 2
-; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: add a4, a0, a2
+; CHECK-NEXT: slli a5, a2, 2
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: subw a3, a3, a2
; CHECK-NEXT: add a1, a1, a5
; CHECK-NEXT: slli a3, a3, 32
; CHECK-NEXT: srli a3, a3, 32
-; CHECK-NEXT: add a0, a4, a0
+; CHECK-NEXT: add a0, a2, a0
; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: addi a0, a0, 1
; CHECK-NEXT: .LBB14_6: # %bb35
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lbu a3, 0(a1)
-; CHECK-NEXT: lbu a4, 0(a2)
-; CHECK-NEXT: add a3, a4, a3
-; CHECK-NEXT: sb a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 1
+; CHECK-NEXT: lbu a2, 0(a1)
+; CHECK-NEXT: lbu a3, 0(a4)
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sb a2, 0(a4)
+; CHECK-NEXT: addi a4, a4, 1
; CHECK-NEXT: addi a1, a1, 5
-; CHECK-NEXT: bne a2, a0, .LBB14_6
+; CHECK-NEXT: bne a4, a0, .LBB14_6
; CHECK-NEXT: .LBB14_7: # %bb34
; CHECK-NEXT: ret
bb:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index 08cab7cd359b9..a9d926b56386a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -29,32 +29,30 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV32-NEXT: # %bb.3: # %for.cond1.preheader.us.preheader
; RV32-NEXT: li t0, 32
; RV32-NEXT: # %bb.4: # %for.cond1.preheader.us.preheader
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: .cfi_offset s1, -8
-; RV32-NEXT: .cfi_offset s2, -12
-; RV32-NEXT: .cfi_remember_state
; RV32-NEXT: add t3, a0, t3
; RV32-NEXT: add t4, a2, t4
-; RV32-NEXT: add s0, a4, t5
+; RV32-NEXT: add t5, a4, t5
; RV32-NEXT: bltu t6, t1, .LBB0_6
; RV32-NEXT: # %bb.5: # %for.cond1.preheader.us.preheader
; RV32-NEXT: li t1, 32
; RV32-NEXT: .LBB0_6: # %for.cond1.preheader.us.preheader
; RV32-NEXT: add t3, t3, a6
-; RV32-NEXT: add t5, t4, a6
-; RV32-NEXT: add t4, s0, a6
+; RV32-NEXT: add t6, t4, a6
+; RV32-NEXT: add t4, t5, a6
; RV32-NEXT: j .LBB0_8
; RV32-NEXT: # %bb.7: # %for.cond1.preheader.us.preheader
; RV32-NEXT: mv t1, t0
; RV32-NEXT: .LBB0_8: # %for.cond1.preheader.us.preheader
-; RV32-NEXT: .cfi_restore_state
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
+; RV32-NEXT: .cfi_offset s2, -12
; RV32-NEXT: li t0, 0
-; RV32-NEXT: sltu t5, a0, t5
+; RV32-NEXT: sltu t5, a0, t6
; RV32-NEXT: sltu t6, a2, t3
; RV32-NEXT: and t5, t5, t6
; RV32-NEXT: sltu t4, a0, t4
diff --git a/llvm/test/CodeGen/RISCV/xcvbi.ll b/llvm/test/CodeGen/RISCV/xcvbi.ll
index ca2e416e334f0..d5d11585970b0 100644
--- a/llvm/test/CodeGen/RISCV/xcvbi.ll
+++ b/llvm/test/CodeGen/RISCV/xcvbi.ll
@@ -67,14 +67,14 @@ define i32 @select_beqimm_1(i32 %a, i32 %x, i32 %y) {
; CHECK_NOPT: # %bb.0: # %entry
; CHECK_NOPT-NEXT: addi sp, sp, -16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
; CHECK_NOPT-NEXT: cv.beqimm a0, -16, .LBB2_2
; CHECK_NOPT-NEXT: # %bb.1: # %entry
-; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: .LBB2_2: # %entry
; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: .LBB2_2: # %entry
+; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
; CHECK_NOPT-NEXT: addi sp, sp, 16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0
; CHECK_NOPT-NEXT: ret
@@ -98,14 +98,14 @@ define i32 @select_beqimm_2(i32 %a, i32 %x, i32 %y) {
; CHECK_NOPT: # %bb.0: # %entry
; CHECK_NOPT-NEXT: addi sp, sp, -16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
; CHECK_NOPT-NEXT: cv.beqimm a0, 0, .LBB3_2
; CHECK_NOPT-NEXT: # %bb.1: # %entry
-; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: .LBB3_2: # %entry
; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: .LBB3_2: # %entry
+; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
; CHECK_NOPT-NEXT: addi sp, sp, 16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0
; CHECK_NOPT-NEXT: ret
@@ -129,14 +129,14 @@ define i32 @select_beqimm_3(i32 %a, i32 %x, i32 %y) {
; CHECK_NOPT: # %bb.0: # %entry
; CHECK_NOPT-NEXT: addi sp, sp, -16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
; CHECK_NOPT-NEXT: cv.beqimm a0, 15, .LBB4_2
; CHECK_NOPT-NEXT: # %bb.1: # %entry
-; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: .LBB4_2: # %entry
; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: .LBB4_2: # %entry
+; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
; CHECK_NOPT-NEXT: addi sp, sp, 16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0
; CHECK_NOPT-NEXT: ret
@@ -160,15 +160,15 @@ define i32 @select_no_beqimm_1(i32 %a, i32 %x, i32 %y) {
; CHECK_NOPT: # %bb.0: # %entry
; CHECK_NOPT-NEXT: addi sp, sp, -16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
; CHECK_NOPT-NEXT: li a1, -17
-; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill
; CHECK_NOPT-NEXT: beq a0, a1, .LBB5_2
; CHECK_NOPT-NEXT: # %bb.1: # %entry
-; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: .LBB5_2: # %entry
; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: .LBB5_2: # %entry
+; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
; CHECK_NOPT-NEXT: addi sp, sp, 16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0
; CHECK_NOPT-NEXT: ret
@@ -193,15 +193,15 @@ define i32 @select_no_beqimm_2(i32 %a, i32 %x, i32 %y) {
; CHECK_NOPT: # %bb.0: # %entry
; CHECK_NOPT-NEXT: addi sp, sp, -16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
; CHECK_NOPT-NEXT: li a1, 16
-; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill
; CHECK_NOPT-NEXT: beq a0, a1, .LBB6_2
; CHECK_NOPT-NEXT: # %bb.1: # %entry
-; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: .LBB6_2: # %entry
; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: .LBB6_2: # %entry
+; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
; CHECK_NOPT-NEXT: addi sp, sp, 16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0
; CHECK_NOPT-NEXT: ret
@@ -226,14 +226,14 @@ define i32 @select_bneimm_1(i32 %a, i32 %x, i32 %y) {
; CHECK_NOPT: # %bb.0: # %entry
; CHECK_NOPT-NEXT: addi sp, sp, -16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a2, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
; CHECK_NOPT-NEXT: cv.bneimm a0, 0, .LBB7_2
; CHECK_NOPT-NEXT: # %bb.1: # %entry
-; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT: .LBB7_2: # %entry
; CHECK_NOPT-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT: .LBB7_2: # %entry
+; CHECK_NOPT-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
; CHECK_NOPT-NEXT: addi sp, sp, 16
; CHECK_NOPT-NEXT: .cfi_def_cfa_offset 0
; CHECK_NOPT-NEXT: ret
diff --git a/llvm/test/CodeGen/SystemZ/swifterror.ll b/llvm/test/CodeGen/SystemZ/swifterror.ll
index 1b18287cac146..39f0907295ff4 100644
--- a/llvm/test/CodeGen/SystemZ/swifterror.ll
+++ b/llvm/test/CodeGen/SystemZ/swifterror.ll
@@ -162,8 +162,8 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
; CHECK-O0: je
; CHECK-O0: lghi %r2, 16
; CHECK-O0: brasl %r14, malloc
-; CHECK-O0: lgr %r[[REG1:[0-9]+]], %r2
-; CHECK-O0: mvi 8(%r[[REG1]]), 1
+; CHECK-O0: lgr %r{{[0-9]+}}, %r2
+; CHECK-O0: mvi 8(%r2), 1
; CHECK-O0: jnh
; reload from stack
; CHECK-O0: lg %r9, [[OFFS:[0-9]+]](%r15)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 8a5a15a57912c..08b99c67d9d55 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -328,14 +328,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
; CHECK-LABEL: test_vec_mul_scalar_add_char:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: ldr r4, [sp, #28]
-; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: ldr.w r12, [sp, #28]
+; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq.w .LBB5_11
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: adds r7, r1, r4
-; CHECK-NEXT: add.w r6, r3, r4, lsl #2
+; CHECK-NEXT: add.w r7, r1, r12
+; CHECK-NEXT: add.w r6, r3, r12, lsl #2
; CHECK-NEXT: cmp r7, r3
-; CHECK-NEXT: add.w r5, r0, r4
+; CHECK-NEXT: add.w r5, r0, r12
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
@@ -348,15 +348,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB5_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: and r12, r4, #3
-; CHECK-NEXT: subs r7, r4, #1
+; CHECK-NEXT: and r8, r12, #3
+; CHECK-NEXT: sub.w r7, r12, #1
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB5_6
; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB5_8
; CHECK-NEXT: .LBB5_4: @ %vector.ph
-; CHECK-NEXT: dlstp.32 lr, r4
+; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB5_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u32 q0, [r0], #4
@@ -366,18 +366,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
; CHECK-NEXT: letp lr, .LBB5_5
; CHECK-NEXT: b .LBB5_11
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
-; CHECK-NEXT: bic r7, r4, #3
+; CHECK-NEXT: bic r7, r12, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: add.w r5, r3, #8
-; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: adds r6, r0, #3
; CHECK-NEXT: adds r7, r1, #1
; CHECK-NEXT: .LBB5_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb r9, [r6, #-3]
-; CHECK-NEXT: add.w r8, r8, #4
+; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: ldrb r4, [r7, #-1]
; CHECK-NEXT: smlabb r4, r4, r9, r2
; CHECK-NEXT: str r4, [r5, #-8]
@@ -396,11 +396,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: le lr, .LBB5_7
; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT: wls lr, r12, .LBB5_11
+; CHECK-NEXT: wls lr, r8, .LBB5_11
; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
-; CHECK-NEXT: add r0, r8
-; CHECK-NEXT: add r1, r8
-; CHECK-NEXT: add.w r3, r3, r8, lsl #2
+; CHECK-NEXT: add r0, r12
+; CHECK-NEXT: add r1, r12
+; CHECK-NEXT: add.w r3, r3, r12, lsl #2
; CHECK-NEXT: .LBB5_10: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb r7, [r0], #1
@@ -604,14 +604,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
; CHECK-LABEL: test_vec_mul_scalar_add_uchar:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: ldr r4, [sp, #28]
-; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: ldr.w r12, [sp, #28]
+; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq.w .LBB7_11
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: adds r7, r1, r4
-; CHECK-NEXT: add.w r6, r3, r4, lsl #2
+; CHECK-NEXT: add.w r7, r1, r12
+; CHECK-NEXT: add.w r6, r3, r12, lsl #2
; CHECK-NEXT: cmp r7, r3
-; CHECK-NEXT: add.w r5, r0, r4
+; CHECK-NEXT: add.w r5, r0, r12
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
@@ -624,15 +624,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB7_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: and r12, r4, #3
-; CHECK-NEXT: subs r7, r4, #1
+; CHECK-NEXT: and r8, r12, #3
+; CHECK-NEXT: sub.w r7, r12, #1
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB7_6
; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB7_8
; CHECK-NEXT: .LBB7_4: @ %vector.ph
-; CHECK-NEXT: dlstp.32 lr, r4
+; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB7_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u32 q0, [r0], #4
@@ -642,18 +642,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
; CHECK-NEXT: letp lr, .LBB7_5
; CHECK-NEXT: b .LBB7_11
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
-; CHECK-NEXT: bic r7, r4, #3
+; CHECK-NEXT: bic r7, r12, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: add.w r5, r3, #8
-; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: adds r6, r0, #3
; CHECK-NEXT: adds r7, r1, #1
; CHECK-NEXT: .LBB7_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb r9, [r6, #-3]
-; CHECK-NEXT: add.w r8, r8, #4
+; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: ldrb r4, [r7, #-1]
; CHECK-NEXT: smlabb r4, r4, r9, r2
; CHECK-NEXT: str r4, [r5, #-8]
@@ -672,11 +672,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: le lr, .LBB7_7
; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT: wls lr, r12, .LBB7_11
+; CHECK-NEXT: wls lr, r8, .LBB7_11
; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
-; CHECK-NEXT: add r0, r8
-; CHECK-NEXT: add r1, r8
-; CHECK-NEXT: add.w r3, r3, r8, lsl #2
+; CHECK-NEXT: add r0, r12
+; CHECK-NEXT: add r1, r12
+; CHECK-NEXT: add.w r3, r3, r12, lsl #2
; CHECK-NEXT: .LBB7_10: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb r7, [r0], #1
@@ -880,14 +880,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
; CHECK-LABEL: test_vec_mul_scalar_add_int:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: ldr r4, [sp, #28]
-; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: ldr.w r12, [sp, #28]
+; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq.w .LBB9_11
; CHECK-NEXT: @ %bb.1: @ %vector.memcheck
-; CHECK-NEXT: add.w r7, r1, r4, lsl #2
-; CHECK-NEXT: add.w r6, r3, r4, lsl #2
+; CHECK-NEXT: add.w r7, r1, r12, lsl #2
+; CHECK-NEXT: add.w r6, r3, r12, lsl #2
; CHECK-NEXT: cmp r7, r3
-; CHECK-NEXT: add.w r5, r0, r4, lsl #2
+; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
@@ -900,15 +900,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB9_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: and r12, r4, #3
-; CHECK-NEXT: subs r7, r4, #1
+; CHECK-NEXT: and r8, r12, #3
+; CHECK-NEXT: sub.w r7, r12, #1
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB9_6
; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB9_8
; CHECK-NEXT: .LBB9_4: @ %vector.ph
-; CHECK-NEXT: dlstp.32 lr, r4
+; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
@@ -918,18 +918,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
; CHECK-NEXT: letp lr, .LBB9_5
; CHECK-NEXT: b .LBB9_11
; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new
-; CHECK-NEXT: bic r7, r4, #3
+; CHECK-NEXT: bic r7, r12, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: add.w r5, r3, #8
-; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: add.w r6, r0, #8
; CHECK-NEXT: add.w r7, r1, #8
; CHECK-NEXT: .LBB9_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r9, [r6, #-8]
-; CHECK-NEXT: add.w r8, r8, #4
+; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: ldr r4, [r7, #-8]
; CHECK-NEXT: mla r4, r4, r9, r2
; CHECK-NEXT: str r4, [r5, #-8]
@@ -950,11 +950,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: le lr, .LBB9_7
; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT: wls lr, r12, .LBB9_11
+; CHECK-NEXT: wls lr, r8, .LBB9_11
; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
-; CHECK-NEXT: add.w r0, r0, r8, lsl #2
-; CHECK-NEXT: add.w r1, r1, r8, lsl #2
-; CHECK-NEXT: add.w r3, r3, r8, lsl #2
+; CHECK-NEXT: add.w r0, r0, r12, lsl #2
+; CHECK-NEXT: add.w r1, r1, r12, lsl #2
+; CHECK-NEXT: add.w r3, r3, r12, lsl #2
; CHECK-NEXT: .LBB9_10: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r7, [r0], #4
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
index a0e690212d5a4..7acc83343dcb8 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
@@ -17,18 +17,18 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias nocapt
; ENABLED-NEXT: .LBB0_2: @ %vector.ph
; ENABLED-NEXT: @ =>This Loop Header: Depth=1
; ENABLED-NEXT: @ Child Loop BB0_3 Depth 2
-; ENABLED-NEXT: mov r12, r0
-; ENABLED-NEXT: mov r4, r2
-; ENABLED-NEXT: mov r5, r1
-; ENABLED-NEXT: mov r6, r3
-; ENABLED-NEXT: dlstp.32 lr, r6
+; ENABLED-NEXT: mov r4, r0
+; ENABLED-NEXT: mov r5, r2
+; ENABLED-NEXT: mov r6, r1
+; ENABLED-NEXT: mov r7, r3
+; ENABLED-NEXT: dlstp.32 lr, r7
; ENABLED-NEXT: .LBB0_3: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_2 Depth=1
; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2
-; ENABLED-NEXT: vldrw.u32 q0, [r5], #16
-; ENABLED-NEXT: vldrw.u32 q1, [r4], #16
+; ENABLED-NEXT: vldrw.u32 q0, [r6], #16
+; ENABLED-NEXT: vldrw.u32 q1, [r5], #16
; ENABLED-NEXT: vadd.i32 q0, q1, q0
-; ENABLED-NEXT: vstrw.32 q0, [r12], #16
+; ENABLED-NEXT: vstrw.32 q0, [r4], #16
; ENABLED-NEXT: letp lr, .LBB0_3
; ENABLED-NEXT: b .LBB0_2
; ENABLED-NEXT: .LBB0_4: @ %for.cond.cleanup
@@ -44,29 +44,29 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias nocapt
; DISABLED-NEXT: movs r6, #1
; DISABLED-NEXT: bic r7, r7, #3
; DISABLED-NEXT: subs r7, #4
-; DISABLED-NEXT: add.w r8, r6, r7, lsr #2
+; DISABLED-NEXT: add.w r12, r6, r7, lsr #2
; DISABLED-NEXT: .LBB0_2: @ %vector.ph
; DISABLED-NEXT: @ =>This Loop Header: Depth=1
; DISABLED-NEXT: @ Child Loop BB0_3 Depth 2
-; DISABLED-NEXT: mov r7, r8
-; DISABLED-NEXT: mov r12, r0
-; DISABLED-NEXT: mov r4, r2
-; DISABLED-NEXT: mov r5, r1
-; DISABLED-NEXT: mov r6, r3
-; DISABLED-NEXT: dls lr, r8
+; DISABLED-NEXT: mov r8, r12
+; DISABLED-NEXT: mov r4, r0
+; DISABLED-NEXT: mov r5, r2
+; DISABLED-NEXT: mov r6, r1
+; DISABLED-NEXT: mov r7, r3
+; DISABLED-NEXT: dls lr, r12
; DISABLED-NEXT: .LBB0_3: @ %vector.body
; DISABLED-NEXT: @ Parent Loop BB0_2 Depth=1
; DISABLED-NEXT: @ => This Inner Loop Header: Depth=2
-; DISABLED-NEXT: vctp.32 r6
-; DISABLED-NEXT: mov lr, r7
+; DISABLED-NEXT: vctp.32 r7
+; DISABLED-NEXT: mov lr, r8
; DISABLED-NEXT: vpstt
-; DISABLED-NEXT: vldrwt.u32 q0, [r5], #16
-; DISABLED-NEXT: vldrwt.u32 q1, [r4], #16
-; DISABLED-NEXT: subs r7, #1
-; DISABLED-NEXT: subs r6, #4
+; DISABLED-NEXT: vldrwt.u32 q0, [r6], #16
+; DISABLED-NEXT: vldrwt.u32 q1, [r5], #16
+; DISABLED-NEXT: sub.w r8, r8, #1
+; DISABLED-NEXT: subs r7, #4
; DISABLED-NEXT: vadd.i32 q0, q1, q0
; DISABLED-NEXT: vpst
-; DISABLED-NEXT: vstrwt.32 q0, [r12], #16
+; DISABLED-NEXT: vstrwt.32 q0, [r4], #16
; DISABLED-NEXT: le lr, .LBB0_3
; DISABLED-NEXT: b .LBB0_2
; DISABLED-NEXT: .LBB0_4: @ %for.cond.cleanup
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 07c06e10979cd..736d5956b6194 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -29,7 +29,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: mov.w r8, #0
; ENABLED-NEXT: mov r9, r12
; ENABLED-NEXT: uxth r0, r0
-; ENABLED-NEXT: rsbs r5, r0, #0
+; ENABLED-NEXT: rsbs r6, r0, #0
; ENABLED-NEXT: b .LBB0_4
; ENABLED-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: movs r0, #0
@@ -52,9 +52,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: bic r0, r9, #3
; ENABLED-NEXT: movs r7, #1
; ENABLED-NEXT: subs r0, #4
-; ENABLED-NEXT: sub.w r4, r2, r8
+; ENABLED-NEXT: sub.w r5, r2, r8
; ENABLED-NEXT: vmov.i32 q1, #0x0
-; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
+; ENABLED-NEXT: add.w r4, r7, r0, lsr #2
; ENABLED-NEXT: sub.w r0, r12, r8
; ENABLED-NEXT: bic r0, r0, #3
; ENABLED-NEXT: subs r0, #4
@@ -65,16 +65,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: .LBB0_6: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2
-; ENABLED-NEXT: vctp.32 r4
+; ENABLED-NEXT: vctp.32 r5
; ENABLED-NEXT: vmov q0, q1
; ENABLED-NEXT: vpstt
; ENABLED-NEXT: vldrht.s32 q1, [r0], #8
; ENABLED-NEXT: vldrht.s32 q2, [r7], #8
-; ENABLED-NEXT: mov lr, r6
-; ENABLED-NEXT: subs r6, #1
+; ENABLED-NEXT: mov lr, r4
+; ENABLED-NEXT: subs r4, #1
; ENABLED-NEXT: vmul.i32 q1, q2, q1
-; ENABLED-NEXT: subs r4, #4
-; ENABLED-NEXT: vshl.s32 q1, r5
+; ENABLED-NEXT: subs r5, #4
+; ENABLED-NEXT: vshl.s32 q1, r6
; ENABLED-NEXT: vadd.i32 q1, q1, q0
; ENABLED-NEXT: le lr, .LBB0_6
; ENABLED-NEXT: @ %bb.7: @ %middle.block
@@ -100,7 +100,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: mov.w r8, #0
; NOREDUCTIONS-NEXT: mov r9, r12
; NOREDUCTIONS-NEXT: uxth r0, r0
-; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
+; NOREDUCTIONS-NEXT: rsbs r6, r0, #0
; NOREDUCTIONS-NEXT: b .LBB0_4
; NOREDUCTIONS-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: movs r0, #0
@@ -123,9 +123,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: bic r0, r9, #3
; NOREDUCTIONS-NEXT: movs r7, #1
; NOREDUCTIONS-NEXT: subs r0, #4
-; NOREDUCTIONS-NEXT: sub.w r4, r2, r8
+; NOREDUCTIONS-NEXT: sub.w r5, r2, r8
; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT: add.w r4, r7, r0, lsr #2
; NOREDUCTIONS-NEXT: sub.w r0, r12, r8
; NOREDUCTIONS-NEXT: bic r0, r0, #3
; NOREDUCTIONS-NEXT: subs r0, #4
@@ -136,16 +136,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2
-; NOREDUCTIONS-NEXT: vctp.32 r4
+; NOREDUCTIONS-NEXT: vctp.32 r5
; NOREDUCTIONS-NEXT: vmov q0, q1
; NOREDUCTIONS-NEXT: vpstt
; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8
; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8
-; NOREDUCTIONS-NEXT: mov lr, r6
-; NOREDUCTIONS-NEXT: subs r6, #1
+; NOREDUCTIONS-NEXT: mov lr, r4
+; NOREDUCTIONS-NEXT: subs r4, #1
; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1
-; NOREDUCTIONS-NEXT: subs r4, #4
-; NOREDUCTIONS-NEXT: vshl.s32 q1, r5
+; NOREDUCTIONS-NEXT: subs r5, #4
+; NOREDUCTIONS-NEXT: vshl.s32 q1, r6
; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0
; NOREDUCTIONS-NEXT: le lr, .LBB0_6
; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index cbcbf1f392ce8..a6a9361050731 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -165,74 +165,69 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" {
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: wls lr, r1, .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT: adds r6, r3, #4
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: mvn r8, #1
-; CHECK-NEXT: @ implicit-def: $r9
-; CHECK-NEXT: @ implicit-def: $r4
+; CHECK-NEXT: add.w r12, r3, #4
+; CHECK-NEXT: add.w r9, r0, #4
+; CHECK-NEXT: mvn r10, #1
+; CHECK-NEXT: @ implicit-def: $r6
+; CHECK-NEXT: @ implicit-def: $r8
; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: ldr.w r1, [r9]
+; CHECK-NEXT: asr.w r2, r8, #31
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: muls r1, r3, r1
-; CHECK-NEXT: adds r4, r4, r1
+; CHECK-NEXT: adds.w r5, r8, r1
; CHECK-NEXT: adc.w r1, r2, r1, asr #31
-; CHECK-NEXT: adds.w r2, r4, #-2147483648
-; CHECK-NEXT: ldrd r2, r4, [r8]
+; CHECK-NEXT: adds.w r2, r5, #-2147483648
+; CHECK-NEXT: ldrd r2, r5, [r10]
+; CHECK-NEXT: adc r8, r1, #0
+; CHECK-NEXT: asr.w r1, r8, #31
+; CHECK-NEXT: strd r6, r2, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: smull r5, r6, r5, r6
+; CHECK-NEXT: subs.w r5, r8, r5
+; CHECK-NEXT: sbcs r1, r6
+; CHECK-NEXT: adds.w r6, r5, #-2147483648
; CHECK-NEXT: adc r5, r1, #0
-; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: smull r4, r2, r4, r9
-; CHECK-NEXT: asrs r1, r5, #31
-; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: subs r4, r5, r4
-; CHECK-NEXT: sbcs r1, r2
-; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds.w r10, r4, #-2147483648
-; CHECK-NEXT: adc r1, r1, #0
-; CHECK-NEXT: ldr r4, [r2, #-4]
-; CHECK-NEXT: muls r4, r3, r4
-; CHECK-NEXT: adds r3, #4
-; CHECK-NEXT: adds.w r12, r4, #-2147483648
-; CHECK-NEXT: asr.w r5, r4, #31
-; CHECK-NEXT: ldr r4, [r6]
-; CHECK-NEXT: adc r5, r5, #0
-; CHECK-NEXT: mul r2, r4, r0
+; CHECK-NEXT: ldr r1, [r9, #-4]
+; CHECK-NEXT: add.w r9, r9, #4
+; CHECK-NEXT: muls r1, r3, r1
+; CHECK-NEXT: adds.w r2, r1, #-2147483648
+; CHECK-NEXT: asr.w r4, r1, #31
+; CHECK-NEXT: ldr.w r1, [r12]
+; CHECK-NEXT: adc r3, r4, #0
+; CHECK-NEXT: mul r4, r1, r0
; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: add.w r2, r2, #-2147483648
-; CHECK-NEXT: asrl r12, r5, r2
-; CHECK-NEXT: smull r2, r5, r4, r12
-; CHECK-NEXT: lsll r2, r5, #30
-; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: asr.w r11, r5, #31
-; CHECK-NEXT: mov r12, r5
-; CHECK-NEXT: lsll r12, r11, r4
-; CHECK-NEXT: mul r2, r2, r9
-; CHECK-NEXT: lsrl r12, r11, #2
-; CHECK-NEXT: adds r2, #2
-; CHECK-NEXT: lsll r12, r11, r2
+; CHECK-NEXT: add.w r4, r4, #-2147483648
+; CHECK-NEXT: asrl r2, r3, r4
+; CHECK-NEXT: smull r2, r3, r1, r2
+; CHECK-NEXT: lsll r2, r3, #30
+; CHECK-NEXT: asr.w r11, r3, #31
+; CHECK-NEXT: mov r4, r3
+; CHECK-NEXT: ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT: lsll r4, r11, r1
+; CHECK-NEXT: lsrl r4, r11, #2
+; CHECK-NEXT: muls r3, r2, r3
+; CHECK-NEXT: adds r3, #2
+; CHECK-NEXT: lsll r4, r11, r3
+; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r2, r4, #-2147483648
+; CHECK-NEXT: asrl r6, r5, r2
+; CHECK-NEXT: movs r2, #2
+; CHECK-NEXT: lsrl r6, r5, #2
+; CHECK-NEXT: adds r3, #4
+; CHECK-NEXT: str r6, [r2]
+; CHECK-NEXT: ldr r2, [r10], #-4
+; CHECK-NEXT: mls r4, r2, r1, r8
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
-; CHECK-NEXT: add.w r5, r12, #-2147483648
-; CHECK-NEXT: asrl r10, r1, r5
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: lsrl r10, r1, #2
-; CHECK-NEXT: movs r1, #2
-; CHECK-NEXT: mov r9, r10
-; CHECK-NEXT: str.w r10, [r1]
-; CHECK-NEXT: ldr r1, [r8], #-4
-; CHECK-NEXT: mls r5, r1, r4, r5
-; CHECK-NEXT: adds.w r4, r5, #-2147483648
-; CHECK-NEXT: asr.w r1, r5, #31
+; CHECK-NEXT: adds.w r8, r4, #-2147483648
+; CHECK-NEXT: asr.w r1, r4, #31
; CHECK-NEXT: adc r1, r1, #0
-; CHECK-NEXT: lsrl r4, r1, #2
-; CHECK-NEXT: rsbs r1, r4, #0
+; CHECK-NEXT: lsrl r8, r1, #2
+; CHECK-NEXT: rsb.w r1, r8, #0
; CHECK-NEXT: str r1, [r2]
-; CHECK-NEXT: str r1, [r6, #-4]
-; CHECK-NEXT: adds r6, #4
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r1, #4
+; CHECK-NEXT: str r1, [r12, #-4]
+; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: .LBB2_3: @ %while.end
; CHECK-NEXT: add sp, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index d076cb00ad7e0..edbbbf25aab0a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -355,8 +355,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: mov lr, r0
; CHECK-NEXT: subs r0, #1
; CHECK-NEXT: sbcs r0, r1, #0
@@ -375,7 +375,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: movw r2, #43691
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: movt r2, #43690
-; CHECK-NEXT: ldr r6, [sp, #128]
+; CHECK-NEXT: ldr r6, [sp, #120]
; CHECK-NEXT: movw r8, :lower16:c
; CHECK-NEXT: umull r1, r2, r1, r2
; CHECK-NEXT: movt r8, :upper16:c
@@ -384,7 +384,6 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ implicit-def: $r5
; CHECK-NEXT: @ implicit-def: $r11
; CHECK-NEXT: mov.w r9, #12
-; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r1, r2, lsr #1
; CHECK-NEXT: add.w r0, r0, r2, lsr #1
; CHECK-NEXT: bic r3, r1, #3
@@ -395,7 +394,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: vdup.32 q6, r0
; CHECK-NEXT: vadd.i32 q4, q0, r7
; CHECK-NEXT: vdup.32 q7, r0
-; CHECK-NEXT: strd r3, r7, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r3, r7, [sp] @ 8-byte Folded Spill
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
@@ -444,21 +443,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: bhi .LBB1_17
; CHECK-NEXT: @ %bb.8: @ %for.body6.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: ldrd r2, r3, [sp, #120]
+; CHECK-NEXT: ldrd r2, r3, [sp, #112]
; CHECK-NEXT: movs r0, #32
; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: mov r4, r6
; CHECK-NEXT: mov r7, r12
; CHECK-NEXT: mov r6, lr
; CHECK-NEXT: bl __aeabi_ldivmod
; CHECK-NEXT: mov lr, r6
-; CHECK-NEXT: mov r6, r4
; CHECK-NEXT: mov r12, r7
-; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #120]
; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: b .LBB1_10
; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
@@ -573,7 +570,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: b .LBB1_27
; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 28166e455aba2..4c0ded4515b65 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -999,7 +999,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: ldrh r6, [r0]
; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: ldrd r4, r10, [r0, #4]
+; CHECK-NEXT: ldrd r4, r9, [r0, #4]
; CHECK-NEXT: sub.w r0, r6, #8
; CHECK-NEXT: add.w r3, r0, r0, lsr #29
; CHECK-NEXT: and r0, r0, #7
@@ -1008,10 +1008,11 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: it gt
; CHECK-NEXT: asrgt r5, r3, #3
; CHECK-NEXT: add.w r3, r4, r6, lsl #2
-; CHECK-NEXT: sub.w r9, r3, #4
+; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: str r3, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: rsbs r3, r6, #0
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r3, r10, #32
+; CHECK-NEXT: add.w r3, r9, #32
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
@@ -1024,8 +1025,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_4: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
+; CHECK-NEXT: ldrd r0, r1, [sp, #20] @ 8-byte Folded Reload
; CHECK-NEXT: wls lr, r0, .LBB16_5
; CHECK-NEXT: b .LBB16_10
; CHECK-NEXT: .LBB16_5: @ %while.end
@@ -1040,15 +1040,17 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
-; CHECK-NEXT: add.w lr, r10, #8
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
-; CHECK-NEXT: ldrd r3, r7, [r10]
-; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
-; CHECK-NEXT: ldrd r11, r8, [r10, #24]
-; CHECK-NEXT: vstrb.8 q0, [r9], #16
+; CHECK-NEXT: add.w lr, r9, #8
+; CHECK-NEXT: ldrd r3, r7, [r9]
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldm.w lr, {r0, r5, lr}
+; CHECK-NEXT: ldrd r10, r11, [r9, #20]
+; CHECK-NEXT: ldr.w r8, [r9, #28]
+; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q0, [r4], #32
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
; CHECK-NEXT: vmul.f32 q0, q0, r3
; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
@@ -1059,9 +1061,9 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
; CHECK-NEXT: vfma.f32 q0, q4, r5
; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
-; CHECK-NEXT: vfma.f32 q0, q5, r6
+; CHECK-NEXT: vfma.f32 q0, q5, lr
; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
-; CHECK-NEXT: vfma.f32 q0, q2, lr
+; CHECK-NEXT: vfma.f32 q0, q2, r10
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: vfma.f32 q0, q1, r8
@@ -1075,25 +1077,26 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
+; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6}
; CHECK-NEXT: vldrw.u32 q1, [r4], #32
+; CHECK-NEXT: add.w r11, r7, #16
; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
+; CHECK-NEXT: ldm.w r11, {r1, r8, r10, r11}
; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
; CHECK-NEXT: vfma.f32 q0, q1, r3
-; CHECK-NEXT: ldrd r9, r1, [r7, #24]
+; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
; CHECK-NEXT: vfma.f32 q0, q6, r5
; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
; CHECK-NEXT: vfma.f32 q0, q4, r6
; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
-; CHECK-NEXT: vfma.f32 q0, q5, r8
+; CHECK-NEXT: vfma.f32 q0, q5, r1
; CHECK-NEXT: adds r7, #32
-; CHECK-NEXT: vfma.f32 q0, q2, r11
-; CHECK-NEXT: vfma.f32 q0, q3, r9
-; CHECK-NEXT: vfma.f32 q0, q1, r1
+; CHECK-NEXT: vfma.f32 q0, q2, r8
+; CHECK-NEXT: vfma.f32 q0, q3, r10
+; CHECK-NEXT: vfma.f32 q0, q1, r11
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
index 652d25af02e7c..8fe310bd3d5e3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -180,15 +180,15 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add.w r2, r9, r10
-; CHECK-NEXT: add.w r7, r1, r9, lsl #1
+; CHECK-NEXT: add.w r5, r1, r9, lsl #1
; CHECK-NEXT: add.w r2, r1, r2, lsl #1
-; CHECK-NEXT: sub.w r5, r8, r9
-; CHECK-NEXT: dlstp.32 lr, r5
+; CHECK-NEXT: sub.w r7, r8, r9
+; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: rsbs r4, r3, #0
-; CHECK-NEXT: vldrh.s32 q0, [r7], #8
+; CHECK-NEXT: vldrh.s32 q0, [r5], #8
; CHECK-NEXT: vldrh.s32 q1, [r2], #8
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vshl.s32 q0, r4
diff --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
index da59cb259db61..22deb23cad27e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
@@ -548,44 +548,43 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: letp lr, .LBB19_1
; CHECK-NEXT: .LBB19_2: @ %entry
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: movw r6, :lower16:arr_20
-; CHECK-NEXT: movt r6, :upper16:arr_20
-; CHECK-NEXT: add.w r3, r6, #80
+; CHECK-NEXT: movw r11, :lower16:arr_20
+; CHECK-NEXT: adr r6, .LCPI19_0
+; CHECK-NEXT: movt r11, :upper16:arr_20
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: movw r0, :lower16:arr_21
; CHECK-NEXT: movt r0, :upper16:arr_21
; CHECK-NEXT: add.w r5, r0, #36
-; CHECK-NEXT: add.w r11, r6, #128
-; CHECK-NEXT: add.w r7, r6, #112
-; CHECK-NEXT: add.w r2, r6, #96
-; CHECK-NEXT: add.w r4, r6, #64
-; CHECK-NEXT: add.w r0, r6, #48
-; CHECK-NEXT: add.w r1, r6, #32
-; CHECK-NEXT: add.w r12, r6, #16
-; CHECK-NEXT: adr r6, .LCPI19_0
-; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: movw r6, :lower16:arr_20
+; CHECK-NEXT: add.w r3, r11, #80
+; CHECK-NEXT: add.w r9, r11, #128
+; CHECK-NEXT: add.w r7, r11, #112
+; CHECK-NEXT: add.w r2, r11, #96
+; CHECK-NEXT: add.w r4, r11, #64
+; CHECK-NEXT: add.w r0, r11, #48
+; CHECK-NEXT: add.w r1, r11, #32
+; CHECK-NEXT: add.w r12, r11, #16
; CHECK-NEXT: mov.w r8, #327685
-; CHECK-NEXT: mov.w r9, #5
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: mov r6, r8
+; CHECK-NEXT: mov.w r10, #5
; CHECK-NEXT: vmov.i16 q1, #0x5
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: movt r6, :upper16:arr_20
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: .LBB19_3: @ %for.cond8.preheader
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str r8, [r5, #-4]
+; CHECK-NEXT: str r6, [r5, #-4]
; CHECK-NEXT: vstrh.16 q1, [r5, #-36]
-; CHECK-NEXT: strh.w r9, [r5]
+; CHECK-NEXT: strh.w r10, [r5]
; CHECK-NEXT: vstrh.16 q1, [r5, #-20]
; CHECK-NEXT: vstrw.32 q0, [r3]
; CHECK-NEXT: vstrh.16 q0, [r12], #152
-; CHECK-NEXT: vstrh.16 q0, [r6], #152
+; CHECK-NEXT: vstrh.16 q0, [r11], #152
; CHECK-NEXT: vstrh.16 q0, [r1], #152
; CHECK-NEXT: vstrh.16 q0, [r0], #152
; CHECK-NEXT: vstrh.16 q0, [r4], #152
; CHECK-NEXT: vstrh.16 q0, [r2], #152
; CHECK-NEXT: vstrh.16 q0, [r7], #152
-; CHECK-NEXT: vstrh.16 q0, [r11], #152
-; CHECK-NEXT: strd r9, r10, [r3, #64]
+; CHECK-NEXT: vstrh.16 q0, [r9], #152
+; CHECK-NEXT: strd r10, r8, [r3, #64]
; CHECK-NEXT: adds r5, #38
; CHECK-NEXT: adds r3, #152
; CHECK-NEXT: le lr, .LBB19_3
@@ -601,46 +600,46 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: vstrb.8 q1, [r0], #16
; CHECK-NEXT: letp lr, .LBB19_5
; CHECK-NEXT: .LBB19_6: @ %for.cond.cleanup6
-; CHECK-NEXT: movw r6, :lower16:arr_20
+; CHECK-NEXT: movw r2, :lower16:arr_20
; CHECK-NEXT: movw r0, #7376
-; CHECK-NEXT: movt r6, :upper16:arr_20
-; CHECK-NEXT: adds r3, r6, r0
+; CHECK-NEXT: movt r2, :upper16:arr_20
+; CHECK-NEXT: adds r3, r2, r0
; CHECK-NEXT: movw r0, #7408
-; CHECK-NEXT: add.w r12, r6, r0
+; CHECK-NEXT: add.w r12, r2, r0
; CHECK-NEXT: movw r0, #7344
-; CHECK-NEXT: add.w r9, r6, r0
+; CHECK-NEXT: add.w r11, r2, r0
; CHECK-NEXT: movw r0, #7312
-; CHECK-NEXT: adds r2, r6, r0
+; CHECK-NEXT: add.w r9, r2, r0
; CHECK-NEXT: movw r0, :lower16:arr_21
-; CHECK-NEXT: add.w r1, r6, #7424
-; CHECK-NEXT: add.w r7, r6, #7392
-; CHECK-NEXT: add.w r4, r6, #7360
-; CHECK-NEXT: add.w r5, r6, #7328
-; CHECK-NEXT: add.w r8, r6, #7296
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, #7424
+; CHECK-NEXT: add.w r7, r2, #7392
+; CHECK-NEXT: add.w r4, r2, #7360
+; CHECK-NEXT: add.w r5, r2, #7328
+; CHECK-NEXT: add.w r6, r2, #7296
+; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: movt r0, :upper16:arr_21
; CHECK-NEXT: addw r0, r0, #1860
; CHECK-NEXT: mov.w r10, #5
-; CHECK-NEXT: dls lr, r6
-; CHECK-NEXT: mov.w r6, #327685
+; CHECK-NEXT: dls lr, r2
+; CHECK-NEXT: mov.w r2, #327685
; CHECK-NEXT: vmov.i16 q1, #0x5
-; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: .LBB19_7: @ %for.cond8.preheader.1
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str r6, [r0, #-4]
+; CHECK-NEXT: str r2, [r0, #-4]
; CHECK-NEXT: vstrh.16 q1, [r0, #-36]
; CHECK-NEXT: strh.w r10, [r0]
; CHECK-NEXT: vstrh.16 q1, [r0, #-20]
; CHECK-NEXT: vstrw.32 q0, [r3]
-; CHECK-NEXT: vstrh.16 q0, [r2], #152
-; CHECK-NEXT: vstrh.16 q0, [r8], #152
-; CHECK-NEXT: vstrh.16 q0, [r5], #152
; CHECK-NEXT: vstrh.16 q0, [r9], #152
+; CHECK-NEXT: vstrh.16 q0, [r6], #152
+; CHECK-NEXT: vstrh.16 q0, [r5], #152
+; CHECK-NEXT: vstrh.16 q0, [r11], #152
; CHECK-NEXT: vstrh.16 q0, [r4], #152
; CHECK-NEXT: vstrh.16 q0, [r7], #152
; CHECK-NEXT: vstrh.16 q0, [r12], #152
; CHECK-NEXT: vstrh.16 q0, [r1], #152
-; CHECK-NEXT: strd r10, r11, [r3, #64]
+; CHECK-NEXT: strd r10, r8, [r3, #64]
; CHECK-NEXT: adds r0, #38
; CHECK-NEXT: adds r3, #152
; CHECK-NEXT: le lr, .LBB19_7
@@ -663,7 +662,7 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: movw r0, #14704
; CHECK-NEXT: add.w r12, r7, r0
; CHECK-NEXT: movw r0, #14688
-; CHECK-NEXT: add.w r8, r7, r0
+; CHECK-NEXT: add.w r11, r7, r0
; CHECK-NEXT: movw r0, #14640
; CHECK-NEXT: add.w r9, r7, r0
; CHECK-NEXT: movw r0, #14624
@@ -681,7 +680,7 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: dls lr, r7
; CHECK-NEXT: mov.w r7, #327685
; CHECK-NEXT: vmov.i16 q1, #0x5
-; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: .LBB19_11: @ %for.cond8.preheader.2
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: str r7, [r1, #-4]
@@ -694,10 +693,10 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: vstrh.16 q0, [r2], #152
; CHECK-NEXT: vstrh.16 q0, [r9], #152
; CHECK-NEXT: vstrh.16 q0, [r5], #152
-; CHECK-NEXT: vstrh.16 q0, [r8], #152
+; CHECK-NEXT: vstrh.16 q0, [r11], #152
; CHECK-NEXT: vstrh.16 q0, [r12], #152
; CHECK-NEXT: vstrh.16 q0, [r4], #152
-; CHECK-NEXT: strd r10, r11, [r3, #64]
+; CHECK-NEXT: strd r10, r8, [r3, #64]
; CHECK-NEXT: adds r1, #38
; CHECK-NEXT: adds r3, #152
; CHECK-NEXT: le lr, .LBB19_11
@@ -721,9 +720,9 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: movt r7, :upper16:arr_20
; CHECK-NEXT: add.w r12, r7, r1
; CHECK-NEXT: movw r1, #21984
-; CHECK-NEXT: add.w r8, r7, r1
+; CHECK-NEXT: add.w r10, r7, r1
; CHECK-NEXT: movw r1, #21952
-; CHECK-NEXT: add.w r9, r7, r1
+; CHECK-NEXT: add.w r8, r7, r1
; CHECK-NEXT: movw r1, #21936
; CHECK-NEXT: movw r0, #21968
; CHECK-NEXT: adds r5, r7, r1
@@ -735,7 +734,7 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: add.w r3, r7, #22016
; CHECK-NEXT: add.w r6, r7, #21888
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: mov.w r10, #5
+; CHECK-NEXT: mov.w r9, #5
; CHECK-NEXT: vmov.i16 q1, #0x5
; CHECK-NEXT: mov.w r11, #0
; CHECK-NEXT: dls lr, r7
@@ -744,18 +743,18 @@ define i32 @reverted(i1 zeroext %b) {
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: str r7, [r2, #-4]
; CHECK-NEXT: vstrh.16 q1, [r2, #-36]
-; CHECK-NEXT: strh.w r10, [r2]
+; CHECK-NEXT: strh.w r9, [r2]
; CHECK-NEXT: vstrh.16 q1, [r2, #-20]
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vstrh.16 q0, [r4], #152
; CHECK-NEXT: vstrh.16 q0, [r6], #152
; CHECK-NEXT: vstrh.16 q0, [r1], #152
; CHECK-NEXT: vstrh.16 q0, [r5], #152
-; CHECK-NEXT: vstrh.16 q0, [r9], #152
; CHECK-NEXT: vstrh.16 q0, [r8], #152
+; CHECK-NEXT: vstrh.16 q0, [r10], #152
; CHECK-NEXT: vstrh.16 q0, [r12], #152
; CHECK-NEXT: vstrh.16 q0, [r3], #152
-; CHECK-NEXT: strd r10, r11, [r0, #64]
+; CHECK-NEXT: strd r9, r11, [r0, #64]
; CHECK-NEXT: adds r2, #38
; CHECK-NEXT: adds r0, #152
; CHECK-NEXT: le lr, .LBB19_15
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index dad856c0677a1..14ea3a3713224 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -14,8 +14,8 @@ define arm_aapcs_vfpcc void @k() {
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: adr r5, .LCPI0_0
; CHECK-NEXT: adr r4, .LCPI0_1
-; CHECK-NEXT: vldrw.u32 q6, [r5]
-; CHECK-NEXT: vldrw.u32 q5, [r4]
+; CHECK-NEXT: vldrw.u32 q5, [r5]
+; CHECK-NEXT: vldrw.u32 q6, [r4]
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: vmov.i8 q1, #0x0
@@ -25,14 +25,14 @@ define arm_aapcs_vfpcc void @k() {
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vand q5, q5, q0
; CHECK-NEXT: vand q6, q6, q0
-; CHECK-NEXT: vcmp.i32 eq, q5, zr
-; CHECK-NEXT: vpsel q5, q2, q1
+; CHECK-NEXT: vand q5, q5, q0
; CHECK-NEXT: vcmp.i32 eq, q6, zr
; CHECK-NEXT: vpsel q6, q2, q1
-; CHECK-NEXT: vstrh.32 q5, [r0]
-; CHECK-NEXT: vstrh.32 q6, [r0, #8]
+; CHECK-NEXT: vcmp.i32 eq, q5, zr
+; CHECK-NEXT: vpsel q5, q2, q1
+; CHECK-NEXT: vstrh.32 q6, [r0]
+; CHECK-NEXT: vstrh.32 q5, [r0, #8]
; CHECK-NEXT: vldrw.u32 q5, [r0]
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vmov.i32 q5, #0x0
diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
index 43ed5eefbf4c7..ff5a27149cb2e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
@@ -17,16 +17,16 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: csel r7, r6, r5, hs
; CHECK-NEXT: add.w lr, r7, #1
-; CHECK-NEXT: mov r4, r5
+; CHECK-NEXT: mov r8, r5
; CHECK-NEXT: vldrh.u16 q0, [r0], #32
; CHECK-NEXT: movs r7, #0
-; CHECK-NEXT: mov r8, r5
+; CHECK-NEXT: mov r6, r5
; CHECK-NEXT: vldrh.u16 q1, [r1], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q0, q1
; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q0, q1
; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q3
; CHECK-NEXT: vldrh.u16 q0, [r1], #32
; CHECK-NEXT: sub.w lr, lr, #1
; CHECK-NEXT: cmp.w lr, #0
@@ -35,75 +35,72 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB0_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q3
; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q1, q0
; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q1, q0
; CHECK-NEXT: vldrh.u16 q1, [r0], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q3
; CHECK-NEXT: vldrh.u16 q0, [r1], #32
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
-; CHECK-NEXT: movs r6, #14
-; CHECK-NEXT: and.w r2, r6, r2, lsl #1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q3
+; CHECK-NEXT: movs r4, #14
+; CHECK-NEXT: and.w r2, r4, r2, lsl #1
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q1, q0
; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r1, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q0
; CHECK-NEXT: vctp.16 r2
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.u16 q1, [r0]
; CHECK-NEXT: cmp r2, #9
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrht.u16 q0, [r1]
-; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0
-; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0
-; CHECK-NEXT: blo .LBB0_10
+; CHECK-NEXT: vmlsldavat.s16 r8, r7, q1, q0
+; CHECK-NEXT: vmlaldavaxt.s16 r6, r5, q1, q0
+; CHECK-NEXT: blo .LBB0_9
; CHECK-NEXT: @ %bb.4: @ %do.body.1
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vctp.16 r2
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrht.u16 q0, [r0, #16]
; CHECK-NEXT: vldrht.u16 q1, [r1, #16]
-; CHECK-NEXT: vmlsldavat.s16 r4, r7, q0, q1
-; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q0, q1
-; CHECK-NEXT: b .LBB0_10
+; CHECK-NEXT: vmlsldavat.s16 r8, r7, q0, q1
+; CHECK-NEXT: vmlaldavaxt.s16 r6, r5, q0, q1
+; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB0_5: @ %if.else
-; CHECK-NEXT: mov.w r4, #0
-; CHECK-NEXT: cbz r2, .LBB0_9
+; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: cbz r2, .LBB0_8
; CHECK-NEXT: @ %bb.6: @ %while.body14.preheader
-; CHECK-NEXT: lsls r6, r2, #1
-; CHECK-NEXT: mov r5, r4
-; CHECK-NEXT: mov r7, r4
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: dlstp.16 lr, r6
+; CHECK-NEXT: lsls r4, r2, #1
+; CHECK-NEXT: mov r5, r8
+; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: mov r7, r8
+; CHECK-NEXT: dlstp.16 lr, r4
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB0_7: @ %while.body14
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
-; CHECK-NEXT: vmlsldava.s16 r2, r7, q0, q1
-; CHECK-NEXT: vmlaldavax.s16 r4, r5, q0, q1
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q0, q1
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q0, q1
; CHECK-NEXT: letp lr, .LBB0_7
-; CHECK-NEXT: @ %bb.8: @ %if.end.loopexit177
-; CHECK-NEXT: mov r8, r4
-; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: b .LBB0_10
+; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: .LBB0_9:
-; CHECK-NEXT: mov r7, r4
-; CHECK-NEXT: mov.w r8, #0
-; CHECK-NEXT: mov r5, r4
-; CHECK-NEXT: .LBB0_10: @ %if.end
-; CHECK-NEXT: asrl r4, r7, #6
-; CHECK-NEXT: asrl r8, r5, #6
-; CHECK-NEXT: str r4, [r3]
-; CHECK-NEXT: str.w r8, [r12]
+; CHECK-NEXT: .LBB0_8:
+; CHECK-NEXT: mov r7, r8
+; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: mov r5, r8
+; CHECK-NEXT: .LBB0_9: @ %if.end
+; CHECK-NEXT: asrl r8, r7, #6
+; CHECK-NEXT: asrl r6, r5, #6
+; CHECK-NEXT: str.w r8, [r3]
+; CHECK-NEXT: str.w r6, [r12]
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%cmp = icmp ugt i32 %numSamples, 15
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index f90af3cc5ba24..c987c4b537200 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -845,12 +845,12 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r12, r3, r5
+; CHECK-NEXT: add.w r9, r3, r5
; CHECK-NEXT: vldrw.u32 q6, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [r3], #16
-; CHECK-NEXT: add.w r10, r12, r5
+; CHECK-NEXT: add.w r10, r9, r5
; CHECK-NEXT: vfma.f32 q4, q7, q6
-; CHECK-NEXT: vldrw.u32 q7, [r12]
+; CHECK-NEXT: vldrw.u32 q7, [r9]
; CHECK-NEXT: add.w r6, r10, r5
; CHECK-NEXT: vfma.f32 q5, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r10]
@@ -1093,18 +1093,16 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vfmat.f32 q5, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r10]
; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vstrw.32 q5, [sp, #40] @ 16-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q6, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
-; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vmov q5, q4
; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
@@ -1122,8 +1120,7 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q4, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r7]
-; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q0, q7
; CHECK-NEXT: le lr, .LBB6_3
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
index 096d4382d2c35..bd0e5dabea3cf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
@@ -8,23 +8,24 @@ define i32 @vaddv(ptr nocapture readonly %data, i32 %N) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: mov lr, r1
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB0_4
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: dls lr, r1
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: .LBB0_2: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r1], #32
-; CHECK-NEXT: vaddva.s32 r0, q0
-; CHECK-NEXT: vldrw.u32 q0, [r1, #-16]
-; CHECK-NEXT: vaddva.s32 r0, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #32
+; CHECK-NEXT: vaddva.s32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #-16]
+; CHECK-NEXT: vaddva.s32 r2, q0
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp11 = icmp sgt i32 %N, 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index cba0f9cbba2ca..3e7ed0b096b82 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -212,9 +212,9 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu
; CHECK-NEXT: beq .LBB2_8
; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph
; CHECK-NEXT: ldr r3, [sp, #64]
-; CHECK-NEXT: mov.w r9, #0
+; CHECK-NEXT: mov.w r11, #0
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: ldr.w r11, [sp, #56]
+; CHECK-NEXT: ldr.w r9, [sp, #56]
; CHECK-NEXT: add.w r0, r1, r3, lsl #1
; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: adds r0, r1, r3
@@ -235,15 +235,15 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu
; CHECK-NEXT: add.w r1, r8, r10
; CHECK-NEXT: add r1, r6
; CHECK-NEXT: add r1, r12
-; CHECK-NEXT: strb.w r1, [r3, r9]
-; CHECK-NEXT: add.w r9, r9, #1
-; CHECK-NEXT: cmp r9, r2
+; CHECK-NEXT: strb.w r1, [r3, r11]
+; CHECK-NEXT: add.w r11, r11, #1
+; CHECK-NEXT: cmp r11, r2
; CHECK-NEXT: beq .LBB2_8
; CHECK-NEXT: .LBB2_5: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB2_7 Depth 2
; CHECK-NEXT: ldr r1, [sp, #68]
-; CHECK-NEXT: ldr.w r12, [r1, r9, lsl #2]
+; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2]
; CHECK-NEXT: subs r1, r0, r0
; CHECK-NEXT: ble .LBB2_3
; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader
@@ -254,7 +254,7 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: mov r8, r12
-; CHECK-NEXT: mla r7, r9, r7, r3
+; CHECK-NEXT: mla r7, r11, r7, r3
; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: mov r10, r12
@@ -262,17 +262,17 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu
; CHECK-NEXT: @ Parent Loop BB2_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrb.s16 q0, [r4], #8
-; CHECK-NEXT: vadd.i16 q1, q0, r11
+; CHECK-NEXT: vadd.i16 q1, q0, r9
; CHECK-NEXT: vldrb.s16 q0, [r7], #8
; CHECK-NEXT: vmlava.s16 r12, q0, q1
; CHECK-NEXT: vldrb.s16 q1, [r5], #8
-; CHECK-NEXT: vadd.i16 q1, q1, r11
+; CHECK-NEXT: vadd.i16 q1, q1, r9
; CHECK-NEXT: vmlava.s16 r6, q0, q1
; CHECK-NEXT: vldrb.s16 q1, [r3], #8
-; CHECK-NEXT: vadd.i16 q1, q1, r11
+; CHECK-NEXT: vadd.i16 q1, q1, r9
; CHECK-NEXT: vmlava.s16 r8, q0, q1
; CHECK-NEXT: vldrb.s16 q1, [r1], #8
-; CHECK-NEXT: vadd.i16 q1, q1, r11
+; CHECK-NEXT: vadd.i16 q1, q1, r9
; CHECK-NEXT: vmlava.s16 r10, q0, q1
; CHECK-NEXT: le lr, .LBB2_7
; CHECK-NEXT: b .LBB2_4
@@ -395,9 +395,9 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
; CHECK-NEXT: beq .LBB3_8
; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph
; CHECK-NEXT: ldr r3, [sp, #64]
-; CHECK-NEXT: mov.w r9, #0
+; CHECK-NEXT: mov.w r11, #0
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: ldr.w r11, [sp, #56]
+; CHECK-NEXT: ldr.w r9, [sp, #56]
; CHECK-NEXT: add.w r0, r1, r3, lsl #1
; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: adds r0, r1, r3
@@ -411,7 +411,7 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_5 Depth 2
; CHECK-NEXT: ldr r1, [sp, #68]
-; CHECK-NEXT: ldr.w r12, [r1, r9, lsl #2]
+; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2]
; CHECK-NEXT: subs r1, r0, r0
; CHECK-NEXT: ble .LBB3_6
; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader
@@ -422,7 +422,7 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: mov r8, r12
-; CHECK-NEXT: mla r7, r9, r7, r3
+; CHECK-NEXT: mla r7, r11, r7, r3
; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: mov r10, r12
@@ -430,17 +430,17 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
; CHECK-NEXT: @ Parent Loop BB3_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrb.s16 q0, [r4], #8
-; CHECK-NEXT: vadd.i16 q1, q0, r11
+; CHECK-NEXT: vadd.i16 q1, q0, r9
; CHECK-NEXT: vldrb.s16 q0, [r7], #8
; CHECK-NEXT: vmlava.s16 r12, q0, q1
; CHECK-NEXT: vldrb.s16 q1, [r5], #8
-; CHECK-NEXT: vadd.i16 q1, q1, r11
+; CHECK-NEXT: vadd.i16 q1, q1, r9
; CHECK-NEXT: vmlava.s16 r6, q0, q1
; CHECK-NEXT: vldrb.s16 q1, [r3], #8
-; CHECK-NEXT: vadd.i16 q1, q1, r11
+; CHECK-NEXT: vadd.i16 q1, q1, r9
; CHECK-NEXT: vmlava.s16 r8, q0, q1
; CHECK-NEXT: vldrb.s16 q1, [r1], #8
-; CHECK-NEXT: vadd.i16 q1, q1, r11
+; CHECK-NEXT: vadd.i16 q1, q1, r9
; CHECK-NEXT: vmlava.s16 r10, q0, q1
; CHECK-NEXT: le lr, .LBB3_5
; CHECK-NEXT: b .LBB3_7
@@ -454,9 +454,9 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
; CHECK-NEXT: add.w r1, r8, r10
; CHECK-NEXT: add r1, r6
; CHECK-NEXT: add r1, r12
-; CHECK-NEXT: strb.w r1, [r3, r9]
-; CHECK-NEXT: add.w r9, r9, #1
-; CHECK-NEXT: cmp r9, r2
+; CHECK-NEXT: strb.w r1, [r3, r11]
+; CHECK-NEXT: add.w r11, r11, #1
+; CHECK-NEXT: cmp r11, r2
; CHECK-NEXT: bne .LBB3_3
; CHECK-NEXT: .LBB3_8: @ %if.end
; CHECK-NEXT: ldr r0, [sp, #72]
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index 29b56639bd769..16e7736886e94 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -12,12 +12,13 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: beq.w .LBB0_8
; CHECK-NEXT: @ %bb.1: @ %entry
; CHECK-NEXT: mov r11, r2
+; CHECK-NEXT: mov r8, r1
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: bne .LBB0_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: mov r1, r8
; CHECK-NEXT: mov r10, r11
; CHECK-NEXT: b .LBB0_6
; CHECK-NEXT: .LBB0_3: @ %vector.ph
@@ -29,7 +30,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: add.w r10, r11, r2, lsl #2
; CHECK-NEXT: add.w lr, r6, r7, lsr #1
; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
-; CHECK-NEXT: add.w r8, r1, r2, lsl #2
+; CHECK-NEXT: add.w r1, r8, r2, lsl #2
; CHECK-NEXT: add.w r12, r0, r2, lsl #2
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: vmvn.i32 q1, #0x80000000
@@ -37,7 +38,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrd r4, r2, [r0], #8
; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: ldrd r7, r6, [r1], #8
+; CHECK-NEXT: ldrd r7, r6, [r8], #8
; CHECK-NEXT: smull r4, r7, r7, r4
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: rsbs.w r9, r4, #-2147483648
@@ -80,22 +81,22 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: .LBB0_6: @ %for.body.preheader
; CHECK-NEXT: sub.w lr, r3, r2
; CHECK-NEXT: mov.w r0, #-1
-; CHECK-NEXT: mov.w r1, #-2147483648
+; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: mvn r3, #-2147483648
; CHECK-NEXT: .LBB0_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr r2, [r12], #4
-; CHECK-NEXT: ldr r4, [r8], #4
-; CHECK-NEXT: smull r2, r5, r4, r2
-; CHECK-NEXT: asrl r2, r5, #31
-; CHECK-NEXT: subs r4, r1, r2
-; CHECK-NEXT: sbcs.w r4, r0, r5
-; CHECK-NEXT: csel r2, r2, r1, lt
-; CHECK-NEXT: csel r4, r5, r0, lt
-; CHECK-NEXT: subs r5, r2, r3
-; CHECK-NEXT: sbcs r4, r4, #0
-; CHECK-NEXT: csel r2, r2, r3, lt
-; CHECK-NEXT: str r2, [r10], #4
+; CHECK-NEXT: ldr r4, [r12], #4
+; CHECK-NEXT: ldr r5, [r1], #4
+; CHECK-NEXT: smull r4, r5, r5, r4
+; CHECK-NEXT: asrl r4, r5, #31
+; CHECK-NEXT: subs r6, r2, r4
+; CHECK-NEXT: sbcs.w r6, r0, r5
+; CHECK-NEXT: csel r4, r4, r2, lt
+; CHECK-NEXT: csel r5, r5, r0, lt
+; CHECK-NEXT: subs r6, r4, r3
+; CHECK-NEXT: sbcs r5, r5, #0
+; CHECK-NEXT: csel r4, r4, r3, lt
+; CHECK-NEXT: str r4, [r10], #4
; CHECK-NEXT: le lr, .LBB0_7
; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #8
diff --git a/llvm/test/CodeGen/Thumb2/pr52817.ll b/llvm/test/CodeGen/Thumb2/pr52817.ll
index 87615f0a1f7ef..4cc0960e1f57f 100644
--- a/llvm/test/CodeGen/Thumb2/pr52817.ll
+++ b/llvm/test/CodeGen/Thumb2/pr52817.ll
@@ -18,25 +18,25 @@ define i32 @test(ptr %arg, ptr %arg1, ptr %arg2) #0 !dbg !6 {
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: add r7, sp, #12
; CHECK-NEXT: str r8, [sp, #-4]!
-; CHECK-NEXT: mov.w lr, #0
-; CHECK-NEXT: mov.w r9, #1
-; CHECK-NEXT: movw r12, #4100
+; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: mov.w r12, #1
+; CHECK-NEXT: movw lr, #4100
; CHECK-NEXT: LBB0_1: @ %bb3
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r5, r3, #1
-; CHECK-NEXT: str.w lr, [r2]
-; CHECK-NEXT: cmp.w lr, #0
+; CHECK-NEXT: str.w r9, [r2]
+; CHECK-NEXT: cmp.w r9, #0
; CHECK-NEXT: add.w r4, r0, r5, lsl #2
-; CHECK-NEXT: add.w r8, r4, r12
-; CHECK-NEXT: lsl.w r4, r9, r3
+; CHECK-NEXT: add.w r8, r4, lr
+; CHECK-NEXT: lsl.w r4, r12, r3
; CHECK-NEXT: and.w r3, r3, r4
; CHECK-NEXT: add.w r4, r1, r5, lsl #2
; CHECK-NEXT: itte ne
; CHECK-NEXT: movne r6, #0
; CHECK-NEXT: Ltmp0:
; CHECK-NEXT: @DEBUG_VALUE: test:this <- [DW_OP_LLVM_arg 0, DW_OP_plus_uconst 135168, DW_OP_LLVM_arg 1, DW_OP_constu 4, DW_OP_mul, DW_OP_plus, DW_OP_plus_uconst 4, DW_OP_stack_value] $r0, $r5
-; CHECK-NEXT: .loc 1 28 24 prologue_end @ test.cpp:28:24
+; CHECK-NEXT: .loc 1 28 24 prologue_end @ test.cpp:28:24 @[ test.cpp:204:23 ]
; CHECK-NEXT: strne.w r6, [r8]
; CHECK-NEXT: moveq r6, #1
; CHECK-NEXT: ldr r4, [r4, #4]
diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
index fd880a7f42912..a418ef4892b33 100644
--- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll
+++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
@@ -641,53 +641,53 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
; PIC: # %bb.0:
; PIC-NEXT: st %s15, 24(, %s11)
; PIC-NEXT: st %s16, 32(, %s11)
-; PIC-NEXT: and %s2, %s0, (32)0
-; PIC-NEXT: adds.w.sx %s0, -1, %s2
-; PIC-NEXT: cmpu.w %s3, 8, %s0
+; PIC-NEXT: and %s0, %s0, (32)0
+; PIC-NEXT: adds.w.sx %s3, -1, %s0
+; PIC-NEXT: cmpu.w %s2, 8, %s3
; PIC-NEXT: lea %s15, _GLOBAL_OFFSET_TABLE_ at pc_lo(-24)
; PIC-NEXT: and %s15, %s15, (32)0
; PIC-NEXT: sic %s16
; PIC-NEXT: lea.sl %s15, _GLOBAL_OFFSET_TABLE_ at pc_hi(%s16, %s15)
-; PIC-NEXT: brgt.w 0, %s3, .LBB7_9
+; PIC-NEXT: brgt.w 0, %s2, .LBB7_9
; PIC-NEXT: # %bb.1:
-; PIC-NEXT: and %s1, %s1, (32)0
-; PIC-NEXT: adds.w.zx %s0, %s0, (0)1
-; PIC-NEXT: sll %s0, %s0, 2
+; PIC-NEXT: and %s2, %s1, (32)0
+; PIC-NEXT: adds.w.zx %s1, %s3, (0)1
+; PIC-NEXT: sll %s1, %s1, 2
; PIC-NEXT: lea %s3, .LJTI7_0 at gotoff_lo
; PIC-NEXT: and %s3, %s3, (32)0
; PIC-NEXT: lea.sl %s3, .LJTI7_0 at gotoff_hi(%s3, %s15)
-; PIC-NEXT: ldl.sx %s0, (%s0, %s3)
+; PIC-NEXT: ldl.sx %s1, (%s1, %s3)
; PIC-NEXT: lea %s3, br_jt8_m at gotoff_lo
; PIC-NEXT: and %s3, %s3, (32)0
; PIC-NEXT: lea.sl %s3, br_jt8_m at gotoff_hi(%s3, %s15)
-; PIC-NEXT: adds.l %s3, %s3, %s0
-; PIC-NEXT: or %s0, 3, (0)1
+; PIC-NEXT: adds.l %s3, %s3, %s1
+; PIC-NEXT: or %s1, 3, (0)1
; PIC-NEXT: b.l.t (, %s3)
; PIC-NEXT: .LBB7_2:
-; PIC-NEXT: or %s0, 0, (0)1
+; PIC-NEXT: or %s1, 0, (0)1
; PIC-NEXT: br.l.t .LBB7_10
; PIC-NEXT: .LBB7_9:
-; PIC-NEXT: or %s0, 0, %s2
+; PIC-NEXT: or %s1, 0, %s0
; PIC-NEXT: br.l.t .LBB7_10
; PIC-NEXT: .LBB7_6:
-; PIC-NEXT: adds.w.sx %s0, -2, %s1
+; PIC-NEXT: adds.w.sx %s1, -2, %s2
; PIC-NEXT: br.l.t .LBB7_10
; PIC-NEXT: .LBB7_8:
-; PIC-NEXT: or %s0, 11, (0)1
+; PIC-NEXT: or %s1, 11, (0)1
; PIC-NEXT: br.l.t .LBB7_10
; PIC-NEXT: .LBB7_7:
-; PIC-NEXT: or %s0, 10, (0)1
+; PIC-NEXT: or %s1, 10, (0)1
; PIC-NEXT: br.l.t .LBB7_10
; PIC-NEXT: .LBB7_3:
-; PIC-NEXT: or %s0, 4, (0)1
+; PIC-NEXT: or %s1, 4, (0)1
; PIC-NEXT: br.l.t .LBB7_10
; PIC-NEXT: .LBB7_4:
-; PIC-NEXT: adds.w.sx %s0, 3, %s1
+; PIC-NEXT: adds.w.sx %s1, 3, %s2
; PIC-NEXT: br.l.t .LBB7_10
; PIC-NEXT: .LBB7_5:
-; PIC-NEXT: adds.w.sx %s0, -5, %s1
+; PIC-NEXT: adds.w.sx %s1, -5, %s2
; PIC-NEXT: .LBB7_10:
-; PIC-NEXT: adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT: adds.w.sx %s0, %s1, (0)1
; PIC-NEXT: ld %s16, 32(, %s11)
; PIC-NEXT: ld %s15, 24(, %s11)
; PIC-NEXT: b.l.t (, %s10)
diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
index 1962ddebc2115..99338d8f063f5 100644
--- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
+++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
@@ -126,14 +126,14 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: jmp LBB0_8
; CHECK-NEXT: LBB0_18: ## %bb43
-; CHECK-NEXT: Ltmp5:
; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: Ltmp5:
; CHECK-NEXT: calll _OnOverFlow
; CHECK-NEXT: Ltmp6:
; CHECK-NEXT: jmp LBB0_3
; CHECK-NEXT: LBB0_2: ## %bb29
-; CHECK-NEXT: Ltmp7:
; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: Ltmp7:
; CHECK-NEXT: calll _OnOverFlow
; CHECK-NEXT: Ltmp8:
; CHECK-NEXT: LBB0_3: ## %bb30
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
index 06cf968512db8..8a8e7a3b4df2c 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -297,30 +297,30 @@ define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movl $buf, %ecx
-; CHECK-NEXT: movl $32, %edx
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: movl $buf, %edx
+; CHECK-NEXT: movl $32, %esi
; CHECK-NEXT: jmp .LBB5_1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_3: # %if.false
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: decl %esi
+; CHECK-NEXT: decl %eax
; CHECK-NEXT: .LBB5_4: # %loop.bb2
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: leal (%rdi,%rsi), %r8d
+; CHECK-NEXT: leal (%rdi,%rax), %r8d
; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: cmpw $7, %si
+; CHECK-NEXT: cmpw $7, %ax
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx)
+; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
; CHECK-NEXT: jne .LBB5_5
; CHECK-NEXT: .LBB5_1: # %loop.bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne .LBB5_3
; CHECK-NEXT: # %bb.2: # %if.true
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: incl %esi
+; CHECK-NEXT: incl %eax
; CHECK-NEXT: jmp .LBB5_4
; CHECK-NEXT: .LBB5_5: # %exit
; CHECK-NEXT: tilerelease
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
index 4fb0a4445862f..fda0244cea6eb 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -242,25 +242,25 @@ define dso_local void @test3(ptr%buf) nounwind {
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: movl $32, %r14d
-; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
+; CHECK-NEXT: tilestored %tmm0, (%rbx,%r15)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
-; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
+; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
+; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
+; CHECK-NEXT: tilestored %tmm0, (%rbx,%r15)
; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: incl %r15d
-; CHECK-NEXT: cmpw $100, %r15w
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpw $100, %r14w
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
; CHECK-NEXT: addq $72, %rsp
@@ -297,12 +297,12 @@ define dso_local void @test3(ptr%buf) nounwind {
; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_3-1, kind: FK_PCRel_1
; EGPR-NEXT: # %bb.1: # %loop.header.preheader
; EGPR-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
-; EGPR-NEXT: movl $32, %r14d # encoding: [0x41,0xbe,0x20,0x00,0x00,0x00]
-; EGPR-NEXT: xorl %r15d, %r15d # encoding: [0x45,0x31,0xff]
+; EGPR-NEXT: xorl %r14d, %r14d # encoding: [0x45,0x31,0xf6]
+; EGPR-NEXT: movl $32, %r15d # encoding: [0x41,0xbf,0x20,0x00,0x00,0x00]
; EGPR-NEXT: .p2align 4
; EGPR-NEXT: .LBB1_2: # %loop.header
; EGPR-NEXT: # =>This Inner Loop Header: Depth=1
-; EGPR-NEXT: tilestored %tmm0, (%rbx,%r14) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x33]
+; EGPR-NEXT: tilestored %tmm0, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x3b]
; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0]
; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -310,13 +310,13 @@ define dso_local void @test3(ptr%buf) nounwind {
; EGPR-NEXT: # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel
; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08]
; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0]
-; EGPR-NEXT: tileloadd (%rbx,%r14), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x33]
-; EGPR-NEXT: tileloadd (%rbx,%r14), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x14,0x33]
+; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b]
+; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x14,0x3b]
; EGPR-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 # encoding: [0xc4,0xe2,0x6b,0x5e,0xc1]
-; EGPR-NEXT: tilestored %tmm0, (%rbx,%r14) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x33]
+; EGPR-NEXT: tilestored %tmm0, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x3b]
; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0]
-; EGPR-NEXT: incl %r15d # encoding: [0x41,0xff,0xc7]
-; EGPR-NEXT: cmpw $100, %r15w # encoding: [0x66,0x41,0x83,0xff,0x64]
+; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6]
+; EGPR-NEXT: cmpw $100, %r14w # encoding: [0x66,0x41,0x83,0xfe,0x64]
; EGPR-NEXT: jl .LBB1_2 # encoding: [0x7c,A]
; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
; EGPR-NEXT: .LBB1_3: # %exit
diff --git a/llvm/test/CodeGen/X86/atomic32.ll b/llvm/test/CodeGen/X86/atomic32.ll
index f4666738db7d2..3263b9f61a281 100644
--- a/llvm/test/CodeGen/X86/atomic32.ll
+++ b/llvm/test/CodeGen/X86/atomic32.ll
@@ -228,9 +228,10 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
; X64-NEXT: andl %edx, %ecx
; X64-NEXT: notl %ecx
; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB5_2
; X64-NEXT: jmp .LBB5_1
; X64-NEXT: .LBB5_2: # %atomicrmw.end
@@ -251,9 +252,10 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
; X86-NEXT: andl %edx, %ecx
; X86-NEXT: notl %ecx
; X86-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NEXT: sete %cl
-; X86-NEXT: testb $1, %cl
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: sete %al
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testb $1, %al
; X86-NEXT: jne .LBB5_2
; X86-NEXT: jmp .LBB5_1
; X86-NEXT: .LBB5_2: # %atomicrmw.end
@@ -277,9 +279,10 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
; X64-NEXT: subl %ecx, %edx
; X64-NEXT: cmovgl %eax, %ecx
; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB6_2
; X64-NEXT: jmp .LBB6_1
; X64-NEXT: .LBB6_2: # %atomicrmw.end
@@ -300,9 +303,10 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
; X86-CMOV-NEXT: subl %ecx, %edx
; X86-CMOV-NEXT: cmovgl %eax, %ecx
; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32
-; X86-CMOV-NEXT: sete %cl
-; X86-CMOV-NEXT: testb $1, %cl
-; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: movl %eax, %ecx
+; X86-CMOV-NEXT: sete %al
+; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: testb $1, %al
; X86-CMOV-NEXT: jne .LBB6_2
; X86-CMOV-NEXT: jmp .LBB6_1
; X86-CMOV-NEXT: .LBB6_2: # %atomicrmw.end
@@ -334,9 +338,10 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
; X86-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NOCMOV-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NOCMOV-NEXT: sete %cl
-; X86-NOCMOV-NEXT: testb $1, %cl
-; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT: movl %eax, %ecx
+; X86-NOCMOV-NEXT: sete %al
+; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT: testb $1, %al
; X86-NOCMOV-NEXT: jne .LBB6_2
; X86-NOCMOV-NEXT: jmp .LBB6_1
; X86-NOCMOV-NEXT: .LBB6_2: # %atomicrmw.end
@@ -368,9 +373,10 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
; X86-NOX87-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NOX87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NOX87-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NOX87-NEXT: sete %cl
-; X86-NOX87-NEXT: testb $1, %cl
-; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT: movl %eax, %ecx
+; X86-NOX87-NEXT: sete %al
+; X86-NOX87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT: testb $1, %al
; X86-NOX87-NEXT: jne .LBB6_2
; X86-NOX87-NEXT: jmp .LBB6_1
; X86-NOX87-NEXT: .LBB6_2: # %atomicrmw.end
@@ -394,9 +400,10 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
; X64-NEXT: subl %ecx, %edx
; X64-NEXT: cmovlel %eax, %ecx
; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB7_2
; X64-NEXT: jmp .LBB7_1
; X64-NEXT: .LBB7_2: # %atomicrmw.end
@@ -417,9 +424,10 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
; X86-CMOV-NEXT: subl %ecx, %edx
; X86-CMOV-NEXT: cmovlel %eax, %ecx
; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32
-; X86-CMOV-NEXT: sete %cl
-; X86-CMOV-NEXT: testb $1, %cl
-; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: movl %eax, %ecx
+; X86-CMOV-NEXT: sete %al
+; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: testb $1, %al
; X86-CMOV-NEXT: jne .LBB7_2
; X86-CMOV-NEXT: jmp .LBB7_1
; X86-CMOV-NEXT: .LBB7_2: # %atomicrmw.end
@@ -451,9 +459,10 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
; X86-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NOCMOV-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NOCMOV-NEXT: sete %cl
-; X86-NOCMOV-NEXT: testb $1, %cl
-; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT: movl %eax, %ecx
+; X86-NOCMOV-NEXT: sete %al
+; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT: testb $1, %al
; X86-NOCMOV-NEXT: jne .LBB7_2
; X86-NOCMOV-NEXT: jmp .LBB7_1
; X86-NOCMOV-NEXT: .LBB7_2: # %atomicrmw.end
@@ -485,9 +494,10 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
; X86-NOX87-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NOX87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NOX87-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NOX87-NEXT: sete %cl
-; X86-NOX87-NEXT: testb $1, %cl
-; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT: movl %eax, %ecx
+; X86-NOX87-NEXT: sete %al
+; X86-NOX87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT: testb $1, %al
; X86-NOX87-NEXT: jne .LBB7_2
; X86-NOX87-NEXT: jmp .LBB7_1
; X86-NOX87-NEXT: .LBB7_2: # %atomicrmw.end
@@ -511,9 +521,10 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
; X64-NEXT: subl %ecx, %edx
; X64-NEXT: cmoval %eax, %ecx
; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB8_2
; X64-NEXT: jmp .LBB8_1
; X64-NEXT: .LBB8_2: # %atomicrmw.end
@@ -534,9 +545,10 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
; X86-CMOV-NEXT: subl %ecx, %edx
; X86-CMOV-NEXT: cmoval %eax, %ecx
; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32
-; X86-CMOV-NEXT: sete %cl
-; X86-CMOV-NEXT: testb $1, %cl
-; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: movl %eax, %ecx
+; X86-CMOV-NEXT: sete %al
+; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: testb $1, %al
; X86-CMOV-NEXT: jne .LBB8_2
; X86-CMOV-NEXT: jmp .LBB8_1
; X86-CMOV-NEXT: .LBB8_2: # %atomicrmw.end
@@ -568,9 +580,10 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
; X86-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NOCMOV-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NOCMOV-NEXT: sete %cl
-; X86-NOCMOV-NEXT: testb $1, %cl
-; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT: movl %eax, %ecx
+; X86-NOCMOV-NEXT: sete %al
+; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT: testb $1, %al
; X86-NOCMOV-NEXT: jne .LBB8_2
; X86-NOCMOV-NEXT: jmp .LBB8_1
; X86-NOCMOV-NEXT: .LBB8_2: # %atomicrmw.end
@@ -602,9 +615,10 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
; X86-NOX87-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NOX87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NOX87-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NOX87-NEXT: sete %cl
-; X86-NOX87-NEXT: testb $1, %cl
-; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT: movl %eax, %ecx
+; X86-NOX87-NEXT: sete %al
+; X86-NOX87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT: testb $1, %al
; X86-NOX87-NEXT: jne .LBB8_2
; X86-NOX87-NEXT: jmp .LBB8_1
; X86-NOX87-NEXT: .LBB8_2: # %atomicrmw.end
@@ -628,9 +642,10 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
; X64-NEXT: subl %ecx, %edx
; X64-NEXT: cmovbel %eax, %ecx
; X64-NEXT: lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB9_2
; X64-NEXT: jmp .LBB9_1
; X64-NEXT: .LBB9_2: # %atomicrmw.end
@@ -651,9 +666,10 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
; X86-CMOV-NEXT: subl %ecx, %edx
; X86-CMOV-NEXT: cmovbel %eax, %ecx
; X86-CMOV-NEXT: lock cmpxchgl %ecx, sc32
-; X86-CMOV-NEXT: sete %cl
-; X86-CMOV-NEXT: testb $1, %cl
-; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: movl %eax, %ecx
+; X86-CMOV-NEXT: sete %al
+; X86-CMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT: testb $1, %al
; X86-CMOV-NEXT: jne .LBB9_2
; X86-CMOV-NEXT: jmp .LBB9_1
; X86-CMOV-NEXT: .LBB9_2: # %atomicrmw.end
@@ -685,9 +701,10 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
; X86-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NOCMOV-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NOCMOV-NEXT: sete %cl
-; X86-NOCMOV-NEXT: testb $1, %cl
-; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT: movl %eax, %ecx
+; X86-NOCMOV-NEXT: sete %al
+; X86-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT: testb $1, %al
; X86-NOCMOV-NEXT: jne .LBB9_2
; X86-NOCMOV-NEXT: jmp .LBB9_1
; X86-NOCMOV-NEXT: .LBB9_2: # %atomicrmw.end
@@ -719,9 +736,10 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
; X86-NOX87-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NOX87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NOX87-NEXT: lock cmpxchgl %ecx, sc32
-; X86-NOX87-NEXT: sete %cl
-; X86-NOX87-NEXT: testb $1, %cl
-; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT: movl %eax, %ecx
+; X86-NOX87-NEXT: sete %al
+; X86-NOX87-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT: testb $1, %al
; X86-NOX87-NEXT: jne .LBB9_2
; X86-NOX87-NEXT: jmp .LBB9_1
; X86-NOX87-NEXT: .LBB9_2: # %atomicrmw.end
diff --git a/llvm/test/CodeGen/X86/atomic64.ll b/llvm/test/CodeGen/X86/atomic64.ll
index 8f4da356e06cb..6fc72bcf67ec5 100644
--- a/llvm/test/CodeGen/X86/atomic64.ll
+++ b/llvm/test/CodeGen/X86/atomic64.ll
@@ -275,9 +275,10 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
; X64-NEXT: andq %rdx, %rcx
; X64-NEXT: notq %rcx
; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: sete %al
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB5_2
; X64-NEXT: jmp .LBB5_1
; X64-NEXT: .LBB5_2: # %atomicrmw.end
@@ -314,9 +315,10 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
; X64-NEXT: subq %rcx, %rdx
; X64-NEXT: cmovgq %rax, %rcx
; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: sete %al
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB6_2
; X64-NEXT: jmp .LBB6_1
; X64-NEXT: .LBB6_2: # %atomicrmw.end
@@ -406,9 +408,10 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
; X64-NEXT: subq %rcx, %rdx
; X64-NEXT: cmovleq %rax, %rcx
; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: sete %al
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB7_2
; X64-NEXT: jmp .LBB7_1
; X64-NEXT: .LBB7_2: # %atomicrmw.end
@@ -498,9 +501,10 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
; X64-NEXT: subq %rcx, %rdx
; X64-NEXT: cmovaq %rax, %rcx
; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: sete %al
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB8_2
; X64-NEXT: jmp .LBB8_1
; X64-NEXT: .LBB8_2: # %atomicrmw.end
@@ -590,9 +594,10 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
; X64-NEXT: subq %rcx, %rdx
; X64-NEXT: cmovbeq %rax, %rcx
; X64-NEXT: lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT: sete %cl
-; X64-NEXT: testb $1, %cl
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: sete %al
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: testb $1, %al
; X64-NEXT: jne .LBB9_2
; X64-NEXT: jmp .LBB9_1
; X64-NEXT: .LBB9_2: # %atomicrmw.end
diff --git a/llvm/test/CodeGen/X86/atomic6432.ll b/llvm/test/CodeGen/X86/atomic6432.ll
index 8ff5f338e1482..3d0617370a1bb 100644
--- a/llvm/test/CodeGen/X86/atomic6432.ll
+++ b/llvm/test/CodeGen/X86/atomic6432.ll
@@ -16,15 +16,15 @@ define void @atomic_fetch_add64() nounwind {
; X32-NEXT: jmp .LBB0_1
; X32-NEXT: .LBB0_1: # %atomicrmw.start14
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl $1, %ebx
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB0_1
; X32-NEXT: jmp .LBB0_2
; X32-NEXT: .LBB0_2: # %atomicrmw.end13
@@ -35,15 +35,15 @@ define void @atomic_fetch_add64() nounwind {
; X32-NEXT: jmp .LBB0_3
; X32-NEXT: .LBB0_3: # %atomicrmw.start8
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl $3, %ebx
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB0_3
; X32-NEXT: jmp .LBB0_4
; X32-NEXT: .LBB0_4: # %atomicrmw.end7
@@ -75,8 +75,8 @@ define void @atomic_fetch_add64() nounwind {
; X32-NEXT: jmp .LBB0_7
; X32-NEXT: .LBB0_7: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
@@ -84,8 +84,8 @@ define void @atomic_fetch_add64() nounwind {
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: jne .LBB0_7
; X32-NEXT: jmp .LBB0_8
; X32-NEXT: .LBB0_8: # %atomicrmw.end
@@ -114,15 +114,15 @@ define void @atomic_fetch_sub64() nounwind {
; X32-NEXT: jmp .LBB1_1
; X32-NEXT: .LBB1_1: # %atomicrmw.start14
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl $-1, %ebx
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: adcl $-1, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB1_1
; X32-NEXT: jmp .LBB1_2
; X32-NEXT: .LBB1_2: # %atomicrmw.end13
@@ -133,15 +133,15 @@ define void @atomic_fetch_sub64() nounwind {
; X32-NEXT: jmp .LBB1_3
; X32-NEXT: .LBB1_3: # %atomicrmw.start8
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: addl $-3, %ebx
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: adcl $-1, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB1_3
; X32-NEXT: jmp .LBB1_4
; X32-NEXT: .LBB1_4: # %atomicrmw.end7
@@ -173,8 +173,8 @@ define void @atomic_fetch_sub64() nounwind {
; X32-NEXT: jmp .LBB1_7
; X32-NEXT: .LBB1_7: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
@@ -182,8 +182,8 @@ define void @atomic_fetch_sub64() nounwind {
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: sbbl %esi, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: jne .LBB1_7
; X32-NEXT: jmp .LBB1_8
; X32-NEXT: .LBB1_8: # %atomicrmw.end
@@ -211,14 +211,14 @@ define void @atomic_fetch_and64() nounwind {
; X32-NEXT: jmp .LBB2_1
; X32-NEXT: .LBB2_1: # %atomicrmw.start8
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: andl $3, %ebx
; X32-NEXT: xorl %ecx, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB2_1
; X32-NEXT: jmp .LBB2_2
; X32-NEXT: .LBB2_2: # %atomicrmw.end7
@@ -250,8 +250,8 @@ define void @atomic_fetch_and64() nounwind {
; X32-NEXT: jmp .LBB2_5
; X32-NEXT: .LBB2_5: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
@@ -259,8 +259,8 @@ define void @atomic_fetch_and64() nounwind {
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: andl %esi, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: jne .LBB2_5
; X32-NEXT: jmp .LBB2_6
; X32-NEXT: .LBB2_6: # %atomicrmw.end
@@ -287,14 +287,14 @@ define void @atomic_fetch_or64() nounwind {
; X32-NEXT: jmp .LBB3_1
; X32-NEXT: .LBB3_1: # %atomicrmw.start8
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: orl $3, %ebx
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB3_1
; X32-NEXT: jmp .LBB3_2
; X32-NEXT: .LBB3_2: # %atomicrmw.end7
@@ -326,8 +326,8 @@ define void @atomic_fetch_or64() nounwind {
; X32-NEXT: jmp .LBB3_5
; X32-NEXT: .LBB3_5: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
@@ -335,8 +335,8 @@ define void @atomic_fetch_or64() nounwind {
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: orl %esi, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: jne .LBB3_5
; X32-NEXT: jmp .LBB3_6
; X32-NEXT: .LBB3_6: # %atomicrmw.end
@@ -363,14 +363,14 @@ define void @atomic_fetch_xor64() nounwind {
; X32-NEXT: jmp .LBB4_1
; X32-NEXT: .LBB4_1: # %atomicrmw.start8
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
; X32-NEXT: xorl $3, %ebx
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB4_1
; X32-NEXT: jmp .LBB4_2
; X32-NEXT: .LBB4_2: # %atomicrmw.end7
@@ -402,8 +402,8 @@ define void @atomic_fetch_xor64() nounwind {
; X32-NEXT: jmp .LBB4_5
; X32-NEXT: .LBB4_5: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X32-NEXT: movl %eax, %ebx
@@ -411,8 +411,8 @@ define void @atomic_fetch_xor64() nounwind {
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: xorl %esi, %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: jne .LBB4_5
; X32-NEXT: jmp .LBB4_6
; X32-NEXT: .LBB4_6: # %atomicrmw.end
@@ -444,8 +444,8 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
; X32-NEXT: jmp .LBB5_1
; X32-NEXT: .LBB5_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X32-NEXT: movl (%esp), %edi # 4-byte Reload
; X32-NEXT: movl %edx, %ecx
@@ -455,8 +455,8 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
; X32-NEXT: notl %ebx
; X32-NEXT: notl %ecx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB5_1
; X32-NEXT: jmp .LBB5_2
; X32-NEXT: .LBB5_2: # %atomicrmw.end
@@ -486,8 +486,8 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
; X32-NEXT: jmp .LBB6_1
; X32-NEXT: .LBB6_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: movl %ebx, %esi
@@ -497,8 +497,8 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
; X32-NEXT: cmovll %edx, %ecx
; X32-NEXT: cmovll %eax, %ebx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB6_1
; X32-NEXT: jmp .LBB6_2
; X32-NEXT: .LBB6_2: # %atomicrmw.end
@@ -527,8 +527,8 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
; X32-NEXT: jmp .LBB7_1
; X32-NEXT: .LBB7_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: movl %ebx, %esi
@@ -538,8 +538,8 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
; X32-NEXT: cmovgel %edx, %ecx
; X32-NEXT: cmovgel %eax, %ebx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB7_1
; X32-NEXT: jmp .LBB7_2
; X32-NEXT: .LBB7_2: # %atomicrmw.end
@@ -568,8 +568,8 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
; X32-NEXT: jmp .LBB8_1
; X32-NEXT: .LBB8_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: movl %ebx, %esi
@@ -579,8 +579,8 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
; X32-NEXT: cmovbl %edx, %ecx
; X32-NEXT: cmovbl %eax, %ebx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB8_1
; X32-NEXT: jmp .LBB8_2
; X32-NEXT: .LBB8_2: # %atomicrmw.end
@@ -609,8 +609,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
; X32-NEXT: jmp .LBB9_1
; X32-NEXT: .LBB9_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: movl %ebx, %esi
@@ -620,8 +620,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
; X32-NEXT: cmovael %edx, %ecx
; X32-NEXT: cmovael %eax, %ebx
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB9_1
; X32-NEXT: jmp .LBB9_2
; X32-NEXT: .LBB9_2: # %atomicrmw.end
@@ -677,13 +677,13 @@ define void @atomic_fetch_swap64(i64 %x) nounwind {
; X32-NEXT: jmp .LBB12_1
; X32-NEXT: .LBB12_1: # %atomicrmw.start
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: lock cmpxchg8b sc64
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-NEXT: jne .LBB12_1
; X32-NEXT: jmp .LBB12_2
; X32-NEXT: .LBB12_2: # %atomicrmw.end
diff --git a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
index 3d389523dffb3..95faca819e975 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
@@ -17,7 +17,7 @@ define dso_local void @n(ptr %o, i32 %p, i32 %u) nounwind {
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movl %edx, %ebp
-; CHECK-NEXT: movl %esi, %r12d
+; CHECK-NEXT: movl %esi, %r13d
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: callq c
; CHECK-NEXT: movl %eax, %r14d
@@ -28,17 +28,17 @@ define dso_local void @n(ptr %o, i32 %p, i32 %u) nounwind {
; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: cmpl $0, e(%rip)
; CHECK-NEXT: # implicit-def: $r15d
-; CHECK-NEXT: # implicit-def: $r13d
+; CHECK-NEXT: # implicit-def: $r12d
; CHECK-NEXT: je .LBB0_4
; CHECK-NEXT: # %bb.2: # %if.then4
-; CHECK-NEXT: movslq %r12d, %rdi
+; CHECK-NEXT: movslq %r13d, %rdi
; CHECK-NEXT: callq m
; CHECK-NEXT: # implicit-def: $r15d
; CHECK-NEXT: # implicit-def: $r12d
; CHECK-NEXT: .LBB0_3: # %r
; CHECK-NEXT: callq c
-; CHECK-NEXT: movl %r12d, %r13d
; CHECK-NEXT: .LBB0_4: # %if.end8
+; CHECK-NEXT: movl %r12d, %r13d
; CHECK-NEXT: movl %r15d, %edi
; CHECK-NEXT: callq i
; CHECK-NEXT: movl %eax, %r12d
diff --git a/llvm/test/CodeGen/X86/callbr-asm-kill.mir b/llvm/test/CodeGen/X86/callbr-asm-kill.mir
index 5aabeade52da1..58a9e4de77faf 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-kill.mir
+++ b/llvm/test/CodeGen/X86/callbr-asm-kill.mir
@@ -6,14 +6,12 @@
# subsequent use of [[MOV64rm]] in the INLINEASM_BR instruction which should be
# killed instead.
--- |
- ; ModuleID = '<stdin>'
source_filename = "<stdin>"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
declare void @foo(ptr)
- ; Function Attrs: nounwind
define void @test1(ptr %arg, ptr %mem) #0 {
entry:
br label %loop
@@ -57,18 +55,17 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY killed $rsi
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY killed $rdi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY1]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.loop (ir-block-address-taken %ir-block.loop, inlineasm-br-indirect-target):
; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY killed [[COPY2]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY1]]
; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s64) from %ir.mem)
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-NEXT: $rdi = COPY killed [[COPY3]]
+ ; CHECK-NEXT: $rdi = COPY killed [[COPY2]]
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @foo, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[MOV64rm]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[MOV64rm]]
; CHECK-NEXT: INLINEASM_BR &"", 9 /* sideeffect mayload attdialect */, 262190 /* mem:m */, killed [[MOV64rm]], 1, $noreg, 0, $noreg, 13 /* imm */, blockaddress(@test1, %ir-block.loop)
; CHECK-NEXT: JMP_1 %bb.2
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
index e9f529eea7d3f..fe8d131a977ea 100644
--- a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
+++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
@@ -49,8 +49,8 @@ define void @foo(ptr %arg3, i1 %icmp16) #0 {
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %bb5
; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: orl $1, %r12d
; CHECK-NEXT: movq %r14, %r15
+; CHECK-NEXT: orl $1, %r12d
; CHECK-NEXT: .LBB0_2: # %bb7
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 70335f834291d..845c6cd6f2454 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -328,9 +328,9 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
; SSE: # %bb.0: # %start
; SSE-NEXT: movd %esi, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: .p2align 4
; SSE-NEXT: .LBB7_1: # %loop
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
@@ -469,11 +469,11 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; SSE-NEXT: movslq %esi, %rax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: .p2align 4
; SSE-NEXT: .LBB8_1: # %loop
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll
index 0486c1c4d28e9..ca1e676e91f35 100644
--- a/llvm/test/CodeGen/X86/fp128-select.ll
+++ b/llvm/test/CodeGen/X86/fp128-select.ll
@@ -14,7 +14,7 @@ define void @test_select(ptr %p, ptr %q, i1 zeroext %c) {
; SSE-NEXT: testl %edx, %edx
; SSE-NEXT: jne .LBB0_1
; SSE-NEXT: # %bb.3:
-; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN]
; SSE-NEXT: movaps %xmm0, (%rsi)
; SSE-NEXT: retq
; SSE-NEXT: .LBB0_1:
@@ -61,7 +61,7 @@ define fp128 @test_select_cc(fp128, fp128) {
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: jmp .LBB1_3
; SSE-NEXT: .LBB1_1:
-; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0]
; SSE-NEXT: .LBB1_3: # %BB0
; SSE-NEXT: testl %ebx, %ebx
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -103,17 +103,18 @@ define fp128 @test_select_cc(fp128, fp128) {
; NOSSE-NEXT: movq %r12, %rdx
; NOSSE-NEXT: movq %r15, %rcx
; NOSSE-NEXT: callq __eqtf2 at PLT
-; NOSSE-NEXT: movl %eax, %ecx
-; NOSSE-NEXT: xorl %eax, %eax
-; NOSSE-NEXT: testl %ecx, %ecx
-; NOSSE-NEXT: movabsq $4611404543450677248, %rdx # imm = 0x3FFF000000000000
-; NOSSE-NEXT: cmovneq %rax, %rdx
+; NOSSE-NEXT: xorl %ecx, %ecx
+; NOSSE-NEXT: testl %eax, %eax
+; NOSSE-NEXT: movabsq $4611404543450677248, %rax # imm = 0x3FFF000000000000
+; NOSSE-NEXT: cmovneq %rcx, %rax
; NOSSE-NEXT: testl %ebp, %ebp
-; NOSSE-NEXT: je .LBB1_2
-; NOSSE-NEXT: # %bb.1:
+; NOSSE-NEXT: jne .LBB1_2
+; NOSSE-NEXT: # %bb.1: # %BB1
+; NOSSE-NEXT: xorl %r14d, %r14d
+; NOSSE-NEXT: movq %rax, %rbx
+; NOSSE-NEXT: .LBB1_2: # %BB2
; NOSSE-NEXT: movq %r14, %rax
; NOSSE-NEXT: movq %rbx, %rdx
-; NOSSE-NEXT: .LBB1_2: # %BB2
; NOSSE-NEXT: popq %rbx
; NOSSE-NEXT: .cfi_def_cfa_offset 40
; NOSSE-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index bdb7c307a5759..6ca9e77b9a555 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -196,10 +196,10 @@ define i32 @_Z10test_shortPsS_i_512(ptr nocapture readonly, ptr nocapture readon
; SSE2-LABEL: _Z10test_shortPsS_i_512:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -210,19 +210,19 @@ define i32 @_Z10test_shortPsS_i_512(ptr nocapture readonly, ptr nocapture readon
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3
; SSE2-NEXT: pmaddwd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z10test_shortPsS_i_512:
@@ -397,9 +397,9 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB3_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
@@ -445,8 +445,8 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB3_1: # %vector.body
@@ -454,16 +454,16 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3
; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
+; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB3_1
; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -770,10 +770,10 @@ define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly
; SSE2-LABEL: _Z9test_charPcS_i_512:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB6_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -790,19 +790,19 @@ define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: pmaddwd %xmm3, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB6_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: _Z9test_charPcS_i_512:
@@ -993,9 +993,9 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB7_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1045,8 +1045,8 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB7_1: # %vector.body
@@ -1055,17 +1055,17 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl
; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: addq $32, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB7_1
; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1271,19 +1271,19 @@ define i32 @test_unsigned_short_256(ptr nocapture readonly, ptr nocapture readon
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB9_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_unsigned_short_256:
@@ -1442,22 +1442,22 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon
; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: addq $16, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB10_1
; AVX1-NEXT: # %bb.2: # %middle.block
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -1480,15 +1480,15 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB10_1
; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1765,15 +1765,15 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2
-; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2
-; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB11_1
; AVX512-NEXT: # %bb.2: # %middle.block
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -2739,9 +2739,9 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %esi, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB33_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3101,10 +3101,10 @@ define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: addq %rdx, %rdi
; SSE2-NEXT: addq %rcx, %rsi
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB38_1: # %loop
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3121,19 +3121,19 @@ define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmaddwd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: addq $16, %rax
; SSE2-NEXT: cmpq %r8, %rax
; SSE2-NEXT: jb .LBB38_1
; SSE2-NEXT: # %bb.2: # %afterloop
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: add_used_by_loop_phi:
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 8c4bab99a5b7b..76a94f825266a 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -226,12 +226,11 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, ptr %addr, <4 x double
define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
; SSE-LABEL: load_v4f64_v4i32_zero:
; SSE: ## %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE-NEXT: movmskps %xmm1, %eax
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: testb $1, %al
-; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: jne LBB3_1
; SSE-NEXT: ## %bb.2: ## %else
; SSE-NEXT: testb $2, %al
@@ -245,7 +244,7 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
; SSE-NEXT: LBB3_8: ## %else8
; SSE-NEXT: retq
; SSE-NEXT: LBB3_1: ## %cond.load
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB3_4
; SSE-NEXT: LBB3_3: ## %cond.load1
@@ -1096,9 +1095,9 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: jne LBB10_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
@@ -1175,9 +1174,9 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE42-NEXT: psllw $15, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
-; SSE42-NEXT: pxor %xmm0, %xmm0
-; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: xorps %xmm1, %xmm1
+; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: pxor %xmm0, %xmm0
; SSE42-NEXT: jne LBB10_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
@@ -2614,9 +2613,9 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: jne LBB20_1
; SSE2-NEXT: ## %bb.2: ## %else
; SSE2-NEXT: testb $2, %al
@@ -2693,9 +2692,9 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE42-NEXT: psllw $15, %xmm0
; SSE42-NEXT: packsswb %xmm0, %xmm0
; SSE42-NEXT: pmovmskb %xmm0, %eax
-; SSE42-NEXT: pxor %xmm0, %xmm0
-; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: testb $1, %al
+; SSE42-NEXT: pxor %xmm0, %xmm0
; SSE42-NEXT: jne LBB20_1
; SSE42-NEXT: ## %bb.2: ## %else
; SSE42-NEXT: testb $2, %al
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index d752659f94a50..0ebe7d4ea588d 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -174,8 +174,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
; CHECK-SKX: # %bb.0: # %entry
; CHECK-SKX-NEXT: movl %edx, %eax
; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-SKX-NEXT: xorl %ecx, %ecx
; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-SKX-NEXT: xorl %ecx, %ecx
; CHECK-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-SKX-NEXT: .p2align 4
; CHECK-SKX-NEXT: .LBB8_1: # %vector.body
@@ -184,17 +184,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; CHECK-SKX-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
-; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; CHECK-SKX-NEXT: addq $32, %rcx
; CHECK-SKX-NEXT: cmpq %rcx, %rax
; CHECK-SKX-NEXT: jne .LBB8_1
; CHECK-SKX-NEXT: # %bb.2: # %middle.block
-; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm1
-; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm2
; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -209,8 +209,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
; CHECK-AVX512: # %bb.0: # %entry
; CHECK-AVX512-NEXT: movl %edx, %eax
; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-AVX512-NEXT: xorl %ecx, %ecx
; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512-NEXT: xorl %ecx, %ecx
; CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX512-NEXT: .p2align 4
; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body
@@ -219,17 +219,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
-; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; CHECK-AVX512-NEXT: addq $32, %rcx
; CHECK-AVX512-NEXT: cmpq %rcx, %rax
; CHECK-AVX512-NEXT: jne .LBB8_1
; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
-; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm1
-; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm2
; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -244,8 +244,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
; CHECK-VBMI: # %bb.0: # %entry
; CHECK-VBMI-NEXT: movl %edx, %eax
; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-VBMI-NEXT: xorl %ecx, %ecx
; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-VBMI-NEXT: xorl %ecx, %ecx
; CHECK-VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-VBMI-NEXT: .p2align 4
; CHECK-VBMI-NEXT: .LBB8_1: # %vector.body
@@ -254,17 +254,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; CHECK-VBMI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
-; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; CHECK-VBMI-NEXT: addq $32, %rcx
; CHECK-VBMI-NEXT: cmpq %rcx, %rax
; CHECK-VBMI-NEXT: jne .LBB8_1
; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
-; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm1
-; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm2
; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/pcsections-atomics.ll b/llvm/test/CodeGen/X86/pcsections-atomics.ll
index 672ebc1ec7275..c84bafa612948 100644
--- a/llvm/test/CodeGen/X86/pcsections-atomics.ll
+++ b/llvm/test/CodeGen/X86/pcsections-atomics.ll
@@ -689,11 +689,12 @@ define void @atomic8_nand_monotonic(ptr %a) {
; O0-NEXT: orb $-43, %dl
; O0-NEXT: .Lpcsection19:
; O0-NEXT: lock cmpxchgb %dl, (%rcx)
+; O0-NEXT: movb %al, %cl
; O0-NEXT: .Lpcsection20:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; O0-NEXT: .Lpcsection21:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection22:
; O0-NEXT: jne .LBB16_2
; O0-NEXT: jmp .LBB16_1
@@ -1027,11 +1028,12 @@ define void @atomic8_nand_acquire(ptr %a) {
; O0-NEXT: orb $-43, %dl
; O0-NEXT: .Lpcsection32:
; O0-NEXT: lock cmpxchgb %dl, (%rcx)
+; O0-NEXT: movb %al, %cl
; O0-NEXT: .Lpcsection33:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; O0-NEXT: .Lpcsection34:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection35:
; O0-NEXT: jne .LBB23_2
; O0-NEXT: jmp .LBB23_1
@@ -1365,11 +1367,12 @@ define void @atomic8_nand_release(ptr %a) {
; O0-NEXT: orb $-43, %dl
; O0-NEXT: .Lpcsection45:
; O0-NEXT: lock cmpxchgb %dl, (%rcx)
+; O0-NEXT: movb %al, %cl
; O0-NEXT: .Lpcsection46:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; O0-NEXT: .Lpcsection47:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection48:
; O0-NEXT: jne .LBB30_2
; O0-NEXT: jmp .LBB30_1
@@ -1703,11 +1706,12 @@ define void @atomic8_nand_acq_rel(ptr %a) {
; O0-NEXT: orb $-43, %dl
; O0-NEXT: .Lpcsection58:
; O0-NEXT: lock cmpxchgb %dl, (%rcx)
+; O0-NEXT: movb %al, %cl
; O0-NEXT: .Lpcsection59:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; O0-NEXT: .Lpcsection60:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection61:
; O0-NEXT: jne .LBB37_2
; O0-NEXT: jmp .LBB37_1
@@ -2041,11 +2045,12 @@ define void @atomic8_nand_seq_cst(ptr %a) {
; O0-NEXT: orb $-43, %dl
; O0-NEXT: .Lpcsection71:
; O0-NEXT: lock cmpxchgb %dl, (%rcx)
+; O0-NEXT: movb %al, %cl
; O0-NEXT: .Lpcsection72:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; O0-NEXT: .Lpcsection73:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection74:
; O0-NEXT: jne .LBB44_2
; O0-NEXT: jmp .LBB44_1
@@ -3134,11 +3139,12 @@ define void @atomic16_nand_monotonic(ptr %a) {
; O0-NEXT: # kill: def $dx killed $dx killed $edx
; O0-NEXT: .Lpcsection119:
; O0-NEXT: lock cmpxchgw %dx, (%rcx)
+; O0-NEXT: movw %ax, %cx
; O0-NEXT: .Lpcsection120:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT: .Lpcsection121:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection122:
; O0-NEXT: jne .LBB64_2
; O0-NEXT: jmp .LBB64_1
@@ -3488,11 +3494,12 @@ define void @atomic16_nand_acquire(ptr %a) {
; O0-NEXT: # kill: def $dx killed $dx killed $edx
; O0-NEXT: .Lpcsection134:
; O0-NEXT: lock cmpxchgw %dx, (%rcx)
+; O0-NEXT: movw %ax, %cx
; O0-NEXT: .Lpcsection135:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT: .Lpcsection136:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection137:
; O0-NEXT: jne .LBB71_2
; O0-NEXT: jmp .LBB71_1
@@ -3842,11 +3849,12 @@ define void @atomic16_nand_release(ptr %a) {
; O0-NEXT: # kill: def $dx killed $dx killed $edx
; O0-NEXT: .Lpcsection149:
; O0-NEXT: lock cmpxchgw %dx, (%rcx)
+; O0-NEXT: movw %ax, %cx
; O0-NEXT: .Lpcsection150:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT: .Lpcsection151:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection152:
; O0-NEXT: jne .LBB78_2
; O0-NEXT: jmp .LBB78_1
@@ -4196,11 +4204,12 @@ define void @atomic16_nand_acq_rel(ptr %a) {
; O0-NEXT: # kill: def $dx killed $dx killed $edx
; O0-NEXT: .Lpcsection164:
; O0-NEXT: lock cmpxchgw %dx, (%rcx)
+; O0-NEXT: movw %ax, %cx
; O0-NEXT: .Lpcsection165:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT: .Lpcsection166:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection167:
; O0-NEXT: jne .LBB85_2
; O0-NEXT: jmp .LBB85_1
@@ -4550,11 +4559,12 @@ define void @atomic16_nand_seq_cst(ptr %a) {
; O0-NEXT: # kill: def $dx killed $dx killed $edx
; O0-NEXT: .Lpcsection179:
; O0-NEXT: lock cmpxchgw %dx, (%rcx)
+; O0-NEXT: movw %ax, %cx
; O0-NEXT: .Lpcsection180:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; O0-NEXT: .Lpcsection181:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection182:
; O0-NEXT: jne .LBB92_2
; O0-NEXT: jmp .LBB92_1
@@ -5606,11 +5616,12 @@ define void @atomic32_nand_monotonic(ptr %a) {
; O0-NEXT: orl $-43, %edx
; O0-NEXT: .Lpcsection225:
; O0-NEXT: lock cmpxchgl %edx, (%rcx)
+; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection226:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT: .Lpcsection227:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection228:
; O0-NEXT: jne .LBB112_2
; O0-NEXT: jmp .LBB112_1
@@ -5944,11 +5955,12 @@ define void @atomic32_nand_acquire(ptr %a) {
; O0-NEXT: orl $-43, %edx
; O0-NEXT: .Lpcsection238:
; O0-NEXT: lock cmpxchgl %edx, (%rcx)
+; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection239:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT: .Lpcsection240:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection241:
; O0-NEXT: jne .LBB119_2
; O0-NEXT: jmp .LBB119_1
@@ -6282,11 +6294,12 @@ define void @atomic32_nand_release(ptr %a) {
; O0-NEXT: orl $-43, %edx
; O0-NEXT: .Lpcsection251:
; O0-NEXT: lock cmpxchgl %edx, (%rcx)
+; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection252:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT: .Lpcsection253:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection254:
; O0-NEXT: jne .LBB126_2
; O0-NEXT: jmp .LBB126_1
@@ -6620,11 +6633,12 @@ define void @atomic32_nand_acq_rel(ptr %a) {
; O0-NEXT: orl $-43, %edx
; O0-NEXT: .Lpcsection264:
; O0-NEXT: lock cmpxchgl %edx, (%rcx)
+; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection265:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT: .Lpcsection266:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection267:
; O0-NEXT: jne .LBB133_2
; O0-NEXT: jmp .LBB133_1
@@ -6958,11 +6972,12 @@ define void @atomic32_nand_seq_cst(ptr %a) {
; O0-NEXT: orl $-43, %edx
; O0-NEXT: .Lpcsection277:
; O0-NEXT: lock cmpxchgl %edx, (%rcx)
+; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection278:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; O0-NEXT: .Lpcsection279:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection280:
; O0-NEXT: jne .LBB140_2
; O0-NEXT: jmp .LBB140_1
@@ -8128,11 +8143,12 @@ define void @atomic64_nand_monotonic(ptr %a) {
; O0-NEXT: orq $-43, %rdx
; O0-NEXT: .Lpcsection326:
; O0-NEXT: lock cmpxchgq %rdx, (%rcx)
+; O0-NEXT: movq %rax, %rcx
; O0-NEXT: .Lpcsection327:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection328:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection329:
; O0-NEXT: jne .LBB162_2
; O0-NEXT: jmp .LBB162_1
@@ -8469,11 +8485,12 @@ define void @atomic64_nand_acquire(ptr %a) {
; O0-NEXT: orq $-43, %rdx
; O0-NEXT: .Lpcsection340:
; O0-NEXT: lock cmpxchgq %rdx, (%rcx)
+; O0-NEXT: movq %rax, %rcx
; O0-NEXT: .Lpcsection341:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection342:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection343:
; O0-NEXT: jne .LBB169_2
; O0-NEXT: jmp .LBB169_1
@@ -8810,11 +8827,12 @@ define void @atomic64_nand_release(ptr %a) {
; O0-NEXT: orq $-43, %rdx
; O0-NEXT: .Lpcsection354:
; O0-NEXT: lock cmpxchgq %rdx, (%rcx)
+; O0-NEXT: movq %rax, %rcx
; O0-NEXT: .Lpcsection355:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection356:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection357:
; O0-NEXT: jne .LBB176_2
; O0-NEXT: jmp .LBB176_1
@@ -9151,11 +9169,12 @@ define void @atomic64_nand_acq_rel(ptr %a) {
; O0-NEXT: orq $-43, %rdx
; O0-NEXT: .Lpcsection368:
; O0-NEXT: lock cmpxchgq %rdx, (%rcx)
+; O0-NEXT: movq %rax, %rcx
; O0-NEXT: .Lpcsection369:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection370:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection371:
; O0-NEXT: jne .LBB183_2
; O0-NEXT: jmp .LBB183_1
@@ -9492,11 +9511,12 @@ define void @atomic64_nand_seq_cst(ptr %a) {
; O0-NEXT: orq $-43, %rdx
; O0-NEXT: .Lpcsection382:
; O0-NEXT: lock cmpxchgq %rdx, (%rcx)
+; O0-NEXT: movq %rax, %rcx
; O0-NEXT: .Lpcsection383:
-; O0-NEXT: sete %cl
+; O0-NEXT: sete %al
+; O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection384:
-; O0-NEXT: testb $1, %cl
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: testb $1, %al
; O0-NEXT: .Lpcsection385:
; O0-NEXT: jne .LBB190_2
; O0-NEXT: jmp .LBB190_1
@@ -10527,8 +10547,8 @@ define void @atomic128_store_unordered(ptr %a) {
; O0-NEXT: jmp .LBB203_1
; O0-NEXT: .LBB203_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection426:
; O0-NEXT: xorl %ecx, %ecx
@@ -10538,8 +10558,8 @@ define void @atomic128_store_unordered(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection429:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection430:
; O0-NEXT: jne .LBB203_1
; O0-NEXT: jmp .LBB203_2
@@ -10654,8 +10674,8 @@ define void @atomic128_store_monotonic(ptr %a) {
; O0-NEXT: jmp .LBB204_1
; O0-NEXT: .LBB204_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection434:
; O0-NEXT: xorl %ecx, %ecx
@@ -10665,8 +10685,8 @@ define void @atomic128_store_monotonic(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection437:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection438:
; O0-NEXT: jne .LBB204_1
; O0-NEXT: jmp .LBB204_2
@@ -10781,8 +10801,8 @@ define void @atomic128_store_release(ptr %a) {
; O0-NEXT: jmp .LBB205_1
; O0-NEXT: .LBB205_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection442:
; O0-NEXT: xorl %ecx, %ecx
@@ -10792,8 +10812,8 @@ define void @atomic128_store_release(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection445:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection446:
; O0-NEXT: jne .LBB205_1
; O0-NEXT: jmp .LBB205_2
@@ -10908,8 +10928,8 @@ define void @atomic128_store_seq_cst(ptr %a) {
; O0-NEXT: jmp .LBB206_1
; O0-NEXT: .LBB206_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection450:
; O0-NEXT: xorl %ecx, %ecx
@@ -10919,8 +10939,8 @@ define void @atomic128_store_seq_cst(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection453:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection454:
; O0-NEXT: jne .LBB206_1
; O0-NEXT: jmp .LBB206_2
@@ -11074,8 +11094,8 @@ define void @atomic128_xchg_monotonic(ptr %a) {
; O0-NEXT: jmp .LBB208_1
; O0-NEXT: .LBB208_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection459:
; O0-NEXT: xorl %ecx, %ecx
@@ -11085,8 +11105,8 @@ define void @atomic128_xchg_monotonic(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection462:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection463:
; O0-NEXT: jne .LBB208_1
; O0-NEXT: jmp .LBB208_2
@@ -11201,8 +11221,8 @@ define void @atomic128_add_monotonic(ptr %a) {
; O0-NEXT: jmp .LBB209_1
; O0-NEXT: .LBB209_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection467:
@@ -11212,8 +11232,8 @@ define void @atomic128_add_monotonic(ptr %a) {
; O0-NEXT: adcq $0, %rcx
; O0-NEXT: .Lpcsection469:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection470:
; O0-NEXT: jne .LBB209_1
; O0-NEXT: jmp .LBB209_2
@@ -11334,8 +11354,8 @@ define void @atomic128_sub_monotonic(ptr %a) {
; O0-NEXT: jmp .LBB210_1
; O0-NEXT: .LBB210_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection474:
@@ -11345,8 +11365,8 @@ define void @atomic128_sub_monotonic(ptr %a) {
; O0-NEXT: adcq $-1, %rcx
; O0-NEXT: .Lpcsection476:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection477:
; O0-NEXT: jne .LBB210_1
; O0-NEXT: jmp .LBB210_2
@@ -11467,8 +11487,8 @@ define void @atomic128_and_monotonic(ptr %a) {
; O0-NEXT: jmp .LBB211_1
; O0-NEXT: .LBB211_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection481:
@@ -11480,8 +11500,8 @@ define void @atomic128_and_monotonic(ptr %a) {
; O0-NEXT: # kill: def $rcx killed $ecx
; O0-NEXT: .Lpcsection484:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection485:
; O0-NEXT: jne .LBB211_1
; O0-NEXT: jmp .LBB211_2
@@ -11599,8 +11619,8 @@ define void @atomic128_or_monotonic(ptr %a) {
; O0-NEXT: jmp .LBB212_1
; O0-NEXT: .LBB212_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection489:
@@ -11608,8 +11628,8 @@ define void @atomic128_or_monotonic(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection490:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection491:
; O0-NEXT: jne .LBB212_1
; O0-NEXT: jmp .LBB212_2
@@ -11724,8 +11744,8 @@ define void @atomic128_xor_monotonic(ptr %a) {
; O0-NEXT: jmp .LBB213_1
; O0-NEXT: .LBB213_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection495:
@@ -11733,8 +11753,8 @@ define void @atomic128_xor_monotonic(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection496:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection497:
; O0-NEXT: jne .LBB213_1
; O0-NEXT: jmp .LBB213_2
@@ -11849,8 +11869,8 @@ define void @atomic128_nand_monotonic(ptr %a) {
; O0-NEXT: jmp .LBB214_1
; O0-NEXT: .LBB214_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection501:
@@ -11864,8 +11884,8 @@ define void @atomic128_nand_monotonic(ptr %a) {
; O0-NEXT: movq $-1, %rcx
; O0-NEXT: .Lpcsection505:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection506:
; O0-NEXT: jne .LBB214_1
; O0-NEXT: jmp .LBB214_2
@@ -11989,8 +12009,8 @@ define void @atomic128_xchg_acquire(ptr %a) {
; O0-NEXT: jmp .LBB215_1
; O0-NEXT: .LBB215_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection510:
; O0-NEXT: xorl %ecx, %ecx
@@ -12000,8 +12020,8 @@ define void @atomic128_xchg_acquire(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection513:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection514:
; O0-NEXT: jne .LBB215_1
; O0-NEXT: jmp .LBB215_2
@@ -12116,8 +12136,8 @@ define void @atomic128_add_acquire(ptr %a) {
; O0-NEXT: jmp .LBB216_1
; O0-NEXT: .LBB216_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection518:
@@ -12127,8 +12147,8 @@ define void @atomic128_add_acquire(ptr %a) {
; O0-NEXT: adcq $0, %rcx
; O0-NEXT: .Lpcsection520:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection521:
; O0-NEXT: jne .LBB216_1
; O0-NEXT: jmp .LBB216_2
@@ -12249,8 +12269,8 @@ define void @atomic128_sub_acquire(ptr %a) {
; O0-NEXT: jmp .LBB217_1
; O0-NEXT: .LBB217_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection525:
@@ -12260,8 +12280,8 @@ define void @atomic128_sub_acquire(ptr %a) {
; O0-NEXT: adcq $-1, %rcx
; O0-NEXT: .Lpcsection527:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection528:
; O0-NEXT: jne .LBB217_1
; O0-NEXT: jmp .LBB217_2
@@ -12382,8 +12402,8 @@ define void @atomic128_and_acquire(ptr %a) {
; O0-NEXT: jmp .LBB218_1
; O0-NEXT: .LBB218_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection532:
@@ -12395,8 +12415,8 @@ define void @atomic128_and_acquire(ptr %a) {
; O0-NEXT: # kill: def $rcx killed $ecx
; O0-NEXT: .Lpcsection535:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection536:
; O0-NEXT: jne .LBB218_1
; O0-NEXT: jmp .LBB218_2
@@ -12514,8 +12534,8 @@ define void @atomic128_or_acquire(ptr %a) {
; O0-NEXT: jmp .LBB219_1
; O0-NEXT: .LBB219_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection540:
@@ -12523,8 +12543,8 @@ define void @atomic128_or_acquire(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection541:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection542:
; O0-NEXT: jne .LBB219_1
; O0-NEXT: jmp .LBB219_2
@@ -12639,8 +12659,8 @@ define void @atomic128_xor_acquire(ptr %a) {
; O0-NEXT: jmp .LBB220_1
; O0-NEXT: .LBB220_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection546:
@@ -12648,8 +12668,8 @@ define void @atomic128_xor_acquire(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection547:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection548:
; O0-NEXT: jne .LBB220_1
; O0-NEXT: jmp .LBB220_2
@@ -12764,8 +12784,8 @@ define void @atomic128_nand_acquire(ptr %a) {
; O0-NEXT: jmp .LBB221_1
; O0-NEXT: .LBB221_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection552:
@@ -12779,8 +12799,8 @@ define void @atomic128_nand_acquire(ptr %a) {
; O0-NEXT: movq $-1, %rcx
; O0-NEXT: .Lpcsection556:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection557:
; O0-NEXT: jne .LBB221_1
; O0-NEXT: jmp .LBB221_2
@@ -12904,8 +12924,8 @@ define void @atomic128_xchg_release(ptr %a) {
; O0-NEXT: jmp .LBB222_1
; O0-NEXT: .LBB222_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection561:
; O0-NEXT: xorl %ecx, %ecx
@@ -12915,8 +12935,8 @@ define void @atomic128_xchg_release(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection564:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection565:
; O0-NEXT: jne .LBB222_1
; O0-NEXT: jmp .LBB222_2
@@ -13030,8 +13050,8 @@ define void @atomic128_add_release(ptr %a) {
; O0-NEXT: jmp .LBB223_1
; O0-NEXT: .LBB223_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection569:
@@ -13041,8 +13061,8 @@ define void @atomic128_add_release(ptr %a) {
; O0-NEXT: adcq $0, %rcx
; O0-NEXT: .Lpcsection571:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection572:
; O0-NEXT: jne .LBB223_1
; O0-NEXT: jmp .LBB223_2
@@ -13163,8 +13183,8 @@ define void @atomic128_sub_release(ptr %a) {
; O0-NEXT: jmp .LBB224_1
; O0-NEXT: .LBB224_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection576:
@@ -13174,8 +13194,8 @@ define void @atomic128_sub_release(ptr %a) {
; O0-NEXT: adcq $-1, %rcx
; O0-NEXT: .Lpcsection578:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection579:
; O0-NEXT: jne .LBB224_1
; O0-NEXT: jmp .LBB224_2
@@ -13296,8 +13316,8 @@ define void @atomic128_and_release(ptr %a) {
; O0-NEXT: jmp .LBB225_1
; O0-NEXT: .LBB225_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection583:
@@ -13309,8 +13329,8 @@ define void @atomic128_and_release(ptr %a) {
; O0-NEXT: # kill: def $rcx killed $ecx
; O0-NEXT: .Lpcsection586:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection587:
; O0-NEXT: jne .LBB225_1
; O0-NEXT: jmp .LBB225_2
@@ -13428,8 +13448,8 @@ define void @atomic128_or_release(ptr %a) {
; O0-NEXT: jmp .LBB226_1
; O0-NEXT: .LBB226_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection591:
@@ -13437,8 +13457,8 @@ define void @atomic128_or_release(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection592:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection593:
; O0-NEXT: jne .LBB226_1
; O0-NEXT: jmp .LBB226_2
@@ -13553,8 +13573,8 @@ define void @atomic128_xor_release(ptr %a) {
; O0-NEXT: jmp .LBB227_1
; O0-NEXT: .LBB227_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection597:
@@ -13562,8 +13582,8 @@ define void @atomic128_xor_release(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection598:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection599:
; O0-NEXT: jne .LBB227_1
; O0-NEXT: jmp .LBB227_2
@@ -13678,8 +13698,8 @@ define void @atomic128_nand_release(ptr %a) {
; O0-NEXT: jmp .LBB228_1
; O0-NEXT: .LBB228_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection603:
@@ -13693,8 +13713,8 @@ define void @atomic128_nand_release(ptr %a) {
; O0-NEXT: movq $-1, %rcx
; O0-NEXT: .Lpcsection607:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection608:
; O0-NEXT: jne .LBB228_1
; O0-NEXT: jmp .LBB228_2
@@ -13818,8 +13838,8 @@ define void @atomic128_xchg_acq_rel(ptr %a) {
; O0-NEXT: jmp .LBB229_1
; O0-NEXT: .LBB229_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection612:
; O0-NEXT: xorl %ecx, %ecx
@@ -13829,8 +13849,8 @@ define void @atomic128_xchg_acq_rel(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection615:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection616:
; O0-NEXT: jne .LBB229_1
; O0-NEXT: jmp .LBB229_2
@@ -13945,8 +13965,8 @@ define void @atomic128_add_acq_rel(ptr %a) {
; O0-NEXT: jmp .LBB230_1
; O0-NEXT: .LBB230_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection620:
@@ -13956,8 +13976,8 @@ define void @atomic128_add_acq_rel(ptr %a) {
; O0-NEXT: adcq $0, %rcx
; O0-NEXT: .Lpcsection622:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection623:
; O0-NEXT: jne .LBB230_1
; O0-NEXT: jmp .LBB230_2
@@ -14078,8 +14098,8 @@ define void @atomic128_sub_acq_rel(ptr %a) {
; O0-NEXT: jmp .LBB231_1
; O0-NEXT: .LBB231_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection627:
@@ -14089,8 +14109,8 @@ define void @atomic128_sub_acq_rel(ptr %a) {
; O0-NEXT: adcq $-1, %rcx
; O0-NEXT: .Lpcsection629:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection630:
; O0-NEXT: jne .LBB231_1
; O0-NEXT: jmp .LBB231_2
@@ -14211,8 +14231,8 @@ define void @atomic128_and_acq_rel(ptr %a) {
; O0-NEXT: jmp .LBB232_1
; O0-NEXT: .LBB232_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection634:
@@ -14224,8 +14244,8 @@ define void @atomic128_and_acq_rel(ptr %a) {
; O0-NEXT: # kill: def $rcx killed $ecx
; O0-NEXT: .Lpcsection637:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection638:
; O0-NEXT: jne .LBB232_1
; O0-NEXT: jmp .LBB232_2
@@ -14343,8 +14363,8 @@ define void @atomic128_or_acq_rel(ptr %a) {
; O0-NEXT: jmp .LBB233_1
; O0-NEXT: .LBB233_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection642:
@@ -14352,8 +14372,8 @@ define void @atomic128_or_acq_rel(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection643:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection644:
; O0-NEXT: jne .LBB233_1
; O0-NEXT: jmp .LBB233_2
@@ -14468,8 +14488,8 @@ define void @atomic128_xor_acq_rel(ptr %a) {
; O0-NEXT: jmp .LBB234_1
; O0-NEXT: .LBB234_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection648:
@@ -14477,8 +14497,8 @@ define void @atomic128_xor_acq_rel(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection649:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection650:
; O0-NEXT: jne .LBB234_1
; O0-NEXT: jmp .LBB234_2
@@ -14593,8 +14613,8 @@ define void @atomic128_nand_acq_rel(ptr %a) {
; O0-NEXT: jmp .LBB235_1
; O0-NEXT: .LBB235_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection654:
@@ -14608,8 +14628,8 @@ define void @atomic128_nand_acq_rel(ptr %a) {
; O0-NEXT: movq $-1, %rcx
; O0-NEXT: .Lpcsection658:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection659:
; O0-NEXT: jne .LBB235_1
; O0-NEXT: jmp .LBB235_2
@@ -14733,8 +14753,8 @@ define void @atomic128_xchg_seq_cst(ptr %a) {
; O0-NEXT: jmp .LBB236_1
; O0-NEXT: .LBB236_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: .Lpcsection663:
; O0-NEXT: xorl %ecx, %ecx
@@ -14744,8 +14764,8 @@ define void @atomic128_xchg_seq_cst(ptr %a) {
; O0-NEXT: movl $42, %ebx
; O0-NEXT: .Lpcsection666:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection667:
; O0-NEXT: jne .LBB236_1
; O0-NEXT: jmp .LBB236_2
@@ -14860,8 +14880,8 @@ define void @atomic128_add_seq_cst(ptr %a) {
; O0-NEXT: jmp .LBB237_1
; O0-NEXT: .LBB237_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection671:
@@ -14871,8 +14891,8 @@ define void @atomic128_add_seq_cst(ptr %a) {
; O0-NEXT: adcq $0, %rcx
; O0-NEXT: .Lpcsection673:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection674:
; O0-NEXT: jne .LBB237_1
; O0-NEXT: jmp .LBB237_2
@@ -14993,8 +15013,8 @@ define void @atomic128_sub_seq_cst(ptr %a) {
; O0-NEXT: jmp .LBB238_1
; O0-NEXT: .LBB238_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection678:
@@ -15004,8 +15024,8 @@ define void @atomic128_sub_seq_cst(ptr %a) {
; O0-NEXT: adcq $-1, %rcx
; O0-NEXT: .Lpcsection680:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection681:
; O0-NEXT: jne .LBB238_1
; O0-NEXT: jmp .LBB238_2
@@ -15126,8 +15146,8 @@ define void @atomic128_and_seq_cst(ptr %a) {
; O0-NEXT: jmp .LBB239_1
; O0-NEXT: .LBB239_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection685:
@@ -15139,8 +15159,8 @@ define void @atomic128_and_seq_cst(ptr %a) {
; O0-NEXT: # kill: def $rcx killed $ecx
; O0-NEXT: .Lpcsection688:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection689:
; O0-NEXT: jne .LBB239_1
; O0-NEXT: jmp .LBB239_2
@@ -15258,8 +15278,8 @@ define void @atomic128_or_seq_cst(ptr %a) {
; O0-NEXT: jmp .LBB240_1
; O0-NEXT: .LBB240_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection693:
@@ -15267,8 +15287,8 @@ define void @atomic128_or_seq_cst(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection694:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection695:
; O0-NEXT: jne .LBB240_1
; O0-NEXT: jmp .LBB240_2
@@ -15383,8 +15403,8 @@ define void @atomic128_xor_seq_cst(ptr %a) {
; O0-NEXT: jmp .LBB241_1
; O0-NEXT: .LBB241_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movq %rax, %rbx
; O0-NEXT: .Lpcsection699:
@@ -15392,8 +15412,8 @@ define void @atomic128_xor_seq_cst(ptr %a) {
; O0-NEXT: movq %rcx, %rdx
; O0-NEXT: .Lpcsection700:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection701:
; O0-NEXT: jne .LBB241_1
; O0-NEXT: jmp .LBB241_2
@@ -15508,8 +15528,8 @@ define void @atomic128_nand_seq_cst(ptr %a) {
; O0-NEXT: jmp .LBB242_1
; O0-NEXT: .LBB242_1: # %atomicrmw.start
; O0-NEXT: # =>This Inner Loop Header: Depth=1
-; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; O0-NEXT: movl %eax, %ecx
; O0-NEXT: .Lpcsection705:
@@ -15523,8 +15543,8 @@ define void @atomic128_nand_seq_cst(ptr %a) {
; O0-NEXT: movq $-1, %rcx
; O0-NEXT: .Lpcsection709:
; O0-NEXT: lock cmpxchg16b (%rsi)
-; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; O0-NEXT: .Lpcsection710:
; O0-NEXT: jne .LBB242_1
; O0-NEXT: jmp .LBB242_2
diff --git a/llvm/test/CodeGen/X86/pr15705.ll b/llvm/test/CodeGen/X86/pr15705.ll
index 3dd4aabe25b57..bbbeb53764686 100644
--- a/llvm/test/CodeGen/X86/pr15705.ll
+++ b/llvm/test/CodeGen/X86/pr15705.ll
@@ -5,18 +5,19 @@
define i32 @PR15705(i32 %x, i32 %a, i32 %b, i32 %c) #0 {
; X86-LABEL: PR15705:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpl %ecx, %edx
-; X86-NEXT: je .LBB0_4
-; X86-NEXT: # %bb.1: # %if.end
; X86-NEXT: cmpl %eax, %edx
-; X86-NEXT: jne .LBB0_3
-; X86-NEXT: # %bb.2:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: .LBB0_3: # %if.end
+; X86-NEXT: jne .LBB0_2
+; X86-NEXT: # %bb.1:
; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_2: # %if.end
+; X86-NEXT: cmpl %ecx, %edx
+; X86-NEXT: jne .LBB0_4
+; X86-NEXT: # %bb.3:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: .LBB0_4: # %return
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/pr32256.ll b/llvm/test/CodeGen/X86/pr32256.ll
index 225a3af551a2c..09f7d92c1db2e 100644
--- a/llvm/test/CodeGen/X86/pr32256.ll
+++ b/llvm/test/CodeGen/X86/pr32256.ll
@@ -9,12 +9,12 @@ define void @_Z1av() {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subl $2, %esp
; CHECK-NEXT: .cfi_def_cfa_offset 6
-; CHECK-NEXT: movb c, %cl
-; CHECK-NEXT: xorb $-1, %cl
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: testb $1, %cl
-; CHECK-NEXT: movb %al, (%esp) # 1-byte Spill
+; CHECK-NEXT: movb c, %al
+; CHECK-NEXT: xorb $-1, %al
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill
+; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: jmp .LBB0_2
; CHECK-NEXT: .LBB0_1: # %land.rhs
diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index c3c96e8228797..02dbe4f545fe5 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -33,6 +33,7 @@ define dso_local void @fn() {
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_15: # %for.inc
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; CHECK-NEXT: .LBB0_1: # %for.cond
; CHECK-NEXT: # =>This Loop Header: Depth=1
@@ -69,7 +70,6 @@ define dso_local void @fn() {
; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
; CHECK-NEXT: testb %bl, %bl
-; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: # implicit-def: $eax
; CHECK-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; CHECK-NEXT: jne .LBB0_15
@@ -119,7 +119,7 @@ define dso_local void @fn() {
; CHECK-NEXT: jne .LBB0_9
; CHECK-NEXT: # %bb.12: # %if.end26
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: testb %dh, %dh
; CHECK-NEXT: je .LBB0_15
; CHECK-NEXT: # %bb.13: # %if.end26
@@ -128,7 +128,7 @@ define dso_local void @fn() {
; CHECK-NEXT: jne .LBB0_15
; CHECK-NEXT: # %bb.14: # %if.then31
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: jmp .LBB0_15
; CHECK-NEXT: .p2align 4
@@ -279,31 +279,34 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) {
; CHECK-NEXT: je .LBB1_3
; CHECK-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movl %ebx, %edx
; CHECK-NEXT: jmp .LBB1_5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_3: # %if.end
; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je .LBB1_4
; CHECK-NEXT: # %bb.9: # %if.then13
; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: movl %ebx, %eax
; CHECK-NEXT: movl $0, %ebx
; CHECK-NEXT: jne .LBB1_8
+; CHECK-NEXT: # %bb.10: # %for.cond35
+; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: movl %ebx, %edx
; CHECK-NEXT: jmp .LBB1_5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: movl %ebx, %eax
-; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: .LBB1_5: # %if.end26
; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: je .LBB1_7
; CHECK-NEXT: # %bb.6: # %if.end26
; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT: movl %ebx, %ecx
+; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: jmp .LBB1_7
entry:
br label %for.cond
diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll
index 173c41140ebef..1a7551f6117e8 100644
--- a/llvm/test/CodeGen/X86/pr49451.ll
+++ b/llvm/test/CodeGen/X86/pr49451.ll
@@ -18,15 +18,15 @@ define void @func_6(i8 %uc_8, i64 %uli_10) nounwind {
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB0_1: # %for.body612
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: testb %dl, %dl
+; X86-NEXT: testb %bl, %bl
; X86-NEXT: je .LBB0_2
; X86-NEXT: # %bb.3: # %if.end1401
; X86-NEXT: # in Loop: Header=BB0_1 Depth=1
; X86-NEXT: addl %eax, %esi
; X86-NEXT: movw %si, s_2
-; X86-NEXT: movw %bx, s_0
+; X86-NEXT: movw %dx, s_0
; X86-NEXT: incl %ecx
-; X86-NEXT: incl %ebx
+; X86-NEXT: incl %edx
; X86-NEXT: cmpw $73, %cx
; X86-NEXT: jl .LBB0_1
; X86-NEXT: # %bb.4: # %for.body1703
diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index b5b80515fc6d9..8df90a935314d 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -21,7 +21,7 @@ define i32 @PR63108() {
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: testb %al, %al
-; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: jne .LBB0_3
; SSE-NEXT: # %bb.4: # %middle.block
; SSE-NEXT: pxor %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index fe71a16039c19..7cfc88a77dea5 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -151,27 +151,27 @@ define dso_local i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: .p2align 4
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: psadbw b+1024(%rax), %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $32, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -356,9 +356,9 @@ define dso_local i32 @sad_avx64i8() nounwind {
; AVX1-LABEL: sad_avx64i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
@@ -408,26 +408,26 @@ define dso_local i32 @sad_avx64i8() nounwind {
; AVX2-LABEL: sad_avx64i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: addq $64, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3
-; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 325f735b09cd9..b4a6960d144e1 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1266,13 +1266,12 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
; X86-SSE-LABEL: add_ss_mask:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: jne .LBB70_1
-; X86-SSE-NEXT: # %bb.2:
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X86-SSE-NEXT: retl
-; X86-SSE-NEXT: .LBB70_1:
+; X86-SSE-NEXT: je .LBB70_2
+; X86-SSE-NEXT: # %bb.1:
; X86-SSE-NEXT: addss %xmm0, %xmm1
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-SSE-NEXT: movaps %xmm1, %xmm2
+; X86-SSE-NEXT: .LBB70_2:
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: add_ss_mask:
@@ -1296,13 +1295,12 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
; X64-SSE-LABEL: add_ss_mask:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: testb $1, %dil
-; X64-SSE-NEXT: jne .LBB70_1
-; X64-SSE-NEXT: # %bb.2:
-; X64-SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X64-SSE-NEXT: retq
-; X64-SSE-NEXT: .LBB70_1:
+; X64-SSE-NEXT: je .LBB70_2
+; X64-SSE-NEXT: # %bb.1:
; X64-SSE-NEXT: addss %xmm0, %xmm1
-; X64-SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-SSE-NEXT: movaps %xmm1, %xmm2
+; X64-SSE-NEXT: .LBB70_2:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: add_ss_mask:
@@ -1336,13 +1334,12 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
; X86-SSE-LABEL: add_sd_mask:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: jne .LBB71_1
-; X86-SSE-NEXT: # %bb.2:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X86-SSE-NEXT: retl
-; X86-SSE-NEXT: .LBB71_1:
+; X86-SSE-NEXT: je .LBB71_2
+; X86-SSE-NEXT: # %bb.1:
; X86-SSE-NEXT: addsd %xmm0, %xmm1
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE-NEXT: movapd %xmm1, %xmm2
+; X86-SSE-NEXT: .LBB71_2:
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: add_sd_mask:
@@ -1366,13 +1363,12 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
; X64-SSE-LABEL: add_sd_mask:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: testb $1, %dil
-; X64-SSE-NEXT: jne .LBB71_1
-; X64-SSE-NEXT: # %bb.2:
-; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X64-SSE-NEXT: retq
-; X64-SSE-NEXT: .LBB71_1:
+; X64-SSE-NEXT: je .LBB71_2
+; X64-SSE-NEXT: # %bb.1:
; X64-SSE-NEXT: addsd %xmm0, %xmm1
-; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE-NEXT: movapd %xmm1, %xmm2
+; X64-SSE-NEXT: .LBB71_2:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: add_sd_mask:
diff --git a/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll b/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll
index 391963de79703..089ec7ae7658e 100644
--- a/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll
+++ b/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll
@@ -60,7 +60,7 @@ zero:
; CHECK: JMP_1 %bb.4
; CHECK: bb.4
; CHECK: bb.5
-; CHECK: %3:gr64 = COPY %10
+; CHECK: %20:gr64 = COPY %10
; CHECK: %4:gr64 = COPY killed %10
; CHECK: %4:gr64 = nuw ADD64ri32 %4, 8, implicit-def dead $eflags
; CHECK: TEST64rr killed %1, %1, implicit-def $eflags
diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll
index 5699c447baf41..473cc9d152627 100644
--- a/llvm/test/CodeGen/X86/swifterror.ll
+++ b/llvm/test/CodeGen/X86/swifterror.ll
@@ -426,18 +426,19 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
; CHECK-O0-NEXT: jmp LBB4_1
; CHECK-O0-NEXT: LBB4_1: ## %bb_loop
; CHECK-O0-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 4-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; CHECK-O0-NEXT: cmpl $0, %ecx
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
+; CHECK-O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-O0-NEXT: cmpl $0, %eax
; CHECK-O0-NEXT: je LBB4_3
; CHECK-O0-NEXT: ## %bb.2: ## %gen_error
; CHECK-O0-NEXT: ## in Loop: Header=BB4_1 Depth=1
; CHECK-O0-NEXT: movl $16, %edi
; CHECK-O0-NEXT: callq _malloc
; CHECK-O0-NEXT: movq %rax, %rcx
-; CHECK-O0-NEXT: movb $1, 8(%rcx)
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-O0-NEXT: movq %rcx, %rax
+; CHECK-O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-O0-NEXT: movb $1, 8(%rax)
; CHECK-O0-NEXT: LBB4_3: ## %bb_cont
; CHECK-O0-NEXT: ## in Loop: Header=BB4_1 Depth=1
; CHECK-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
@@ -964,17 +965,17 @@ define void @swifterror_isel(ptr) {
; CHECK-O0-NEXT: ## implicit-def: $r12
; CHECK-O0-NEXT: jne LBB8_2
; CHECK-O0-NEXT: LBB8_1: ## =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload
; CHECK-O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax ## 2-byte Reload
+; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 ## 8-byte Reload
; CHECK-O0-NEXT: ## implicit-def: $edi
; CHECK-O0-NEXT: movw %ax, %di
; CHECK-O0-NEXT: ## implicit-def: $rax
; CHECK-O0-NEXT: callq *%rax
+; CHECK-O0-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-O0-NEXT: ## implicit-def: $rax
; CHECK-O0-NEXT: movw (%rax), %ax
; CHECK-O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; CHECK-O0-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; CHECK-O0-NEXT: jmp LBB8_1
; CHECK-O0-NEXT: LBB8_2:
; CHECK-O0-NEXT: addq $40, %rsp
diff --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
index ed04647c84066..31c694ee1e014 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
@@ -70,8 +70,7 @@ frameInfo:
machineFunctionInfo: {}
body: |
; CHECK-LABEL: bb.0:
- ; CHECK: renamable $ebp = COPY $edi
- ; CHECK: MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $ebp
+ ; CHECK: MOV32mr %stack.[[SLOT:.+]], 1, $noreg, 0, $noreg, $edi
bb.0:
successors: %bb.2(0x50000000), %bb.1(0x30000000)
liveins: $edi, $esi
@@ -142,8 +141,8 @@ body: |
%64:gr32 = PHI %24, %bb.0, %44, %bb.1, debug-location !18
DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
- ; CHECK: DBG_PHI %stack.1, 1, 32
- ; CHECK: renamable $eax = MOV32rm %stack.1,
+ ; CHECK: DBG_PHI %stack.[[SLOT]], 1, 32
+ ; CHECK: renamable $eax = MOV32rm %stack.[[SLOT]],
; CHECK: DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
$eax = COPY killed %0, debug-location !19
RET 0, killed $eax, debug-location !19
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
index 5976658ccdf86..f1e443b701bbe 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
@@ -10,23 +10,22 @@ define i32 @i32_initially_postidx(ptr %p, i64 %n) {
; CHECK-NEXT: cmp x1, #1
; CHECK-NEXT: b.lt .LBB0_5
; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr w9, [x0], #4
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: ldr w9, [x8], #4
+; CHECK-NEXT: add w0, w0, w9
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: b.lo .LBB0_5
; CHECK-NEXT: // %bb.3: // %for.inc
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: subs x1, x1, #1
; CHECK-NEXT: b.ne .LBB0_2
; CHECK-NEXT: // %bb.4: // %cleanup
-; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_5:
-; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
entry:
%cmp1 = icmp sgt i64 %n, 0
@@ -58,23 +57,22 @@ define i32 @i32_initially_offset(ptr %p, i64 %n) {
; CHECK-NEXT: cmp x1, #1
; CHECK-NEXT: b.lt .LBB1_5
; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: .LBB1_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr w9, [x0], #4
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: ldr w9, [x8], #4
+; CHECK-NEXT: add w0, w0, w9
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: b.lo .LBB1_5
; CHECK-NEXT: // %bb.3: // %for.cond
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: subs x1, x1, #1
; CHECK-NEXT: b.ne .LBB1_2
; CHECK-NEXT: // %bb.4: // %cleanup
-; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_5:
-; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
entry:
%cmp1 = icmp sgt i64 %n, 0
diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
index 7353acd7228cd..1f7a5b12d16b8 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
@@ -7,27 +7,23 @@ target triple = "riscv64-unknown-linux-gnu"
define ptr @foo(ptr %a0, ptr %a1, i64 %a2) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mv a3, a0
; CHECK-NEXT: vsetvli a4, a2, e8, m8, ta, ma
-; CHECK-NEXT: bne a4, a2, .LBB0_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a1)
-; CHECK-NEXT: vse8.v v8, (a0)
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB0_2: # %if.then
+; CHECK-NEXT: beq a4, a2, .LBB0_4
+; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: sub a5, a2, a4
-; CHECK-NEXT: mv a3, a0
-; CHECK-NEXT: .LBB0_3: # %do.body
+; CHECK-NEXT: .LBB0_2: # %do.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vse8.v v8, (a3)
; CHECK-NEXT: add a3, a3, a4
; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: bltu a3, a5, .LBB0_3
-; CHECK-NEXT: # %bb.4: # %do.end
+; CHECK-NEXT: bltu a3, a5, .LBB0_2
+; CHECK-NEXT: # %bb.3: # %do.end
; CHECK-NEXT: sub a2, a2, a3
; CHECK-NEXT: vsetvli a2, a2, e8, m8, ta, ma
+; CHECK-NEXT: .LBB0_4: # %if.end
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vse8.v v8, (a3)
More information about the llvm-commits
mailing list