[llvm] [RegAlloc] Fix the terminal rule check for interference with DstReg (PR #168661)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 07:47:20 PST 2025
https://github.com/hstk30 updated https://github.com/llvm/llvm-project/pull/168661
From 1d840a5792729db13691d1a03f0387c45c6be5f1 Mon Sep 17 00:00:00 2001
From: hstk30 <hanwei62 at huawei.com>
Date: Wed, 19 Nov 2025 12:37:58 +0800
Subject: [PATCH 1/2] Fix the terminal rule check for interference with DstReg
---
llvm/lib/CodeGen/RegisterCoalescer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 25c4375a73ce0..e624088a0964e 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -4150,7 +4150,7 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
continue;
Register OtherSrcReg, OtherReg;
unsigned OtherSrcSubReg = 0, OtherSubReg = 0;
- if (!isMoveInstr(*TRI, &Copy, OtherSrcReg, OtherReg, OtherSrcSubReg,
+ if (!isMoveInstr(*TRI, &MI, OtherSrcReg, OtherReg, OtherSrcSubReg,
OtherSubReg))
return false;
if (OtherReg == SrcReg)
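For readers skimming the patch, the functional change is confined to the single hunk above: inside applyTerminalRule's walk over the other copy-like instructions in the block, the old code decomposed `Copy` again instead of the iterated `MI`, so the OtherReg that the terminal rule later checks against DstReg never came from the other copy. The sketch below is a simplified, self-contained model (toy types and a stand-in isMoveInstr, not the real LLVM classes), meant only to show how the choice of decomposed instruction changes which register reaches that check.

// Simplified model of the terminal-rule walk; toy types, not LLVM code.
#include <cstdio>
#include <vector>

struct CopyMI { int SrcReg; int DstReg; }; // a copy-like instruction

// Stand-in for isMoveInstr(): report the copy's source and destination.
static bool isMoveInstr(const CopyMI &MI, int &Src, int &Dst) {
  Src = MI.SrcReg;
  Dst = MI.DstReg;
  return true;
}

// Walk the other copies in the block and print which register each one
// would contribute to the interference check against DstReg.
static void walkOtherCopies(const CopyMI &Copy, const std::vector<CopyMI> &BB,
                            bool UseFix) {
  int SrcReg, DstReg;
  isMoveInstr(Copy, SrcReg, DstReg);
  for (const CopyMI &MI : BB) {
    if (&MI == &Copy)
      continue;
    int OtherSrcReg, OtherReg;
    // Before the patch the call decomposed `Copy` again, so OtherReg was
    // always Copy's own DstReg; the patch decomposes the iterated `MI`.
    isMoveInstr(UseFix ? MI : Copy, OtherSrcReg, OtherReg);
    if (OtherReg == SrcReg)
      OtherReg = OtherSrcReg;
    std::printf("check interference of %%%d against DstReg %%%d\n", OtherReg,
                DstReg);
  }
}

int main() {
  // Copy under test: %1 = COPY %0.  Another copy in the block: %3 = COPY %2.
  std::vector<CopyMI> BB = {{/*Src=*/0, /*Dst=*/1}, {/*Src=*/2, /*Dst=*/3}};
  walkOtherCopies(BB[0], BB, /*UseFix=*/false); // always reports %1 (wrong)
  walkOtherCopies(BB[0], BB, /*UseFix=*/true);  // reports %3, the other copy
  return 0;
}

The test updates in the second patch are the expected CHECK-line churn from that one-operand change.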
From 2a7284c21c1b12ca6aac723d4fafe852bad6aa85 Mon Sep 17 00:00:00 2001
From: hstk30 <hanwei62 at huawei.com>
Date: Thu, 20 Nov 2025 23:46:37 +0800
Subject: [PATCH 2/2] Fix all failing test cases
---
.../AArch64/aarch64-matrix-umull-smull.ll | 14 +-
.../AArch64/complex-deinterleaving-crash.ll | 52 +-
...plex-deinterleaving-reductions-scalable.ll | 24 +-
.../complex-deinterleaving-reductions.ll | 30 +-
.../AArch64/machine-sink-kill-flags.ll | 3 +-
.../sve-extract-fixed-from-scalable-vector.ll | 12 +-
.../AArch64/sve-extract-fixed-vector.ll | 32 +-
.../AArch64/sve-fixed-length-reshuffle.ll | 12 +-
llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 92 +-
...vergence-divergent-i1-used-outside-loop.ll | 8 +-
.../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 45 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 16 +-
.../atomic_optimizations_global_pointer.ll | 16 +-
.../atomic_optimizations_local_pointer.ll | 20 +-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 2348 +++++++-------
.../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 2498 ++++++++-------
.../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 2498 ++++++++-------
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 2808 ++++++++---------
.../AMDGPU/set-inactive-wwm-overwrite.ll | 8 +-
llvm/test/CodeGen/BPF/objdump_cond_op_2.ll | 3 +-
llvm/test/CodeGen/Hexagon/swp-stages5.ll | 1 -
llvm/test/CodeGen/NVPTX/atomics-b128.ll | 150 +-
llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 40 +-
llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 40 +-
llvm/test/CodeGen/NVPTX/atomics.ll | 12 +-
llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll | 6 +-
llvm/test/CodeGen/PowerPC/licm-xxsplti.ll | 36 +-
.../PowerPC/loop-instr-form-prepare.ll | 8 +-
llvm/test/CodeGen/PowerPC/sink-side-effect.ll | 2 +-
llvm/test/CodeGen/PowerPC/sms-phi-1.ll | 5 +-
llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll | 10 +-
llvm/test/CodeGen/RISCV/branch-on-zero.ll | 16 +-
.../rvv/fixed-vectors-shuffle-exact-vlen.ll | 10 +-
llvm/test/CodeGen/RISCV/rvv/pr95865.ll | 43 +-
llvm/test/CodeGen/RISCV/rvv/remat.ll | 57 +-
llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll | 66 +-
.../CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll | 28 +-
llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll | 12 +-
.../RISCV/rvv/vxrm-insert-out-of-loop.ll | 24 +-
.../varying-outer-2d-reduction.ll | 34 +-
.../Thumb2/LowOverheadLoops/while-loops.ll | 91 +-
.../CodeGen/Thumb2/mve-gather-increment.ll | 24 +-
.../Thumb2/mve-gather-scatter-optimisation.ll | 90 +-
.../Thumb2/mve-laneinterleaving-reduct.ll | 89 +-
llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll | 52 +-
.../CodeGen/WebAssembly/simd-shift-in-loop.ll | 14 +-
.../CodeGen/X86/AMX/amx-ldtilecfg-insert.ll | 18 +-
llvm/test/CodeGen/X86/i128-mul.ll | 178 +-
.../test/CodeGen/X86/loop-strength-reduce5.ll | 10 +-
llvm/test/CodeGen/X86/madd.ll | 22 +-
llvm/test/CodeGen/X86/pr49451.ll | 6 +-
...lar-shift-by-byte-multiple-legalization.ll | 114 +-
llvm/test/CodeGen/X86/x86-shrink-wrapping.ll | 12 +-
llvm/test/CodeGen/X86/xor.ll | 132 +-
54 files changed, 5937 insertions(+), 6054 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 4894932d3c9b1..99c540366fb12 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -803,20 +803,20 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
; CHECK-SD-NEXT: smlal2 v4.2d, v16.4s, v20.4s
; CHECK-SD-NEXT: smlal v6.2d, v16.2s, v20.2s
; CHECK-SD-NEXT: smlal v3.2d, v16.2s, v19.2s
-; CHECK-SD-NEXT: smlal2 v1.2d, v16.4s, v18.4s
+; CHECK-SD-NEXT: smlal2 v0.2d, v16.4s, v18.4s
; CHECK-SD-NEXT: smlal v7.2d, v16.2s, v17.2s
-; CHECK-SD-NEXT: smlal v0.2d, v16.2s, v18.2s
+; CHECK-SD-NEXT: smlal v1.2d, v16.2s, v18.2s
; CHECK-SD-NEXT: smlal2 v5.2d, v16.4s, v17.4s
; CHECK-SD-NEXT: b.ne .LBB6_7
; CHECK-SD-NEXT: // %bb.8: // %middle.block
-; CHECK-SD-NEXT: add v0.2d, v0.2d, v6.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d
; CHECK-SD-NEXT: cmp x10, x9
-; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d
-; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d
-; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
-; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: b.eq .LBB6_15
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
index 7542e9c4b8f5b..a4f20905a85c2 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -35,15 +35,15 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK-LABEL: check_deinterleaving_has_deinterleave:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: add x8, x0, #16
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: mov w9, #32 // =0x20
+; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: movi v4.2d, #0000000000000000
+; CHECK-NEXT: mov w9, #32 // =0x20
+; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: movi v5.2d, #0000000000000000
-; CHECK-NEXT: movi v7.2d, #0000000000000000
; CHECK-NEXT: movi v6.2d, #0000000000000000
+; CHECK-NEXT: movi v7.2d, #0000000000000000
; CHECK-NEXT: movi v16.2d, #0000000000000000
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
@@ -64,31 +64,31 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
; CHECK-NEXT: ushll v24.4s, v18.4h, #0
; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0
; CHECK-NEXT: ushll v20.4s, v20.4h, #0
-; CHECK-NEXT: and v21.16b, v21.16b, v1.16b
-; CHECK-NEXT: and v19.16b, v19.16b, v1.16b
-; CHECK-NEXT: and v22.16b, v22.16b, v1.16b
-; CHECK-NEXT: and v17.16b, v17.16b, v1.16b
-; CHECK-NEXT: and v23.16b, v23.16b, v1.16b
-; CHECK-NEXT: and v24.16b, v24.16b, v1.16b
-; CHECK-NEXT: and v18.16b, v18.16b, v1.16b
-; CHECK-NEXT: and v20.16b, v20.16b, v1.16b
-; CHECK-NEXT: add v4.4s, v4.4s, v19.4s
-; CHECK-NEXT: add v2.4s, v2.4s, v21.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v22.4s
-; CHECK-NEXT: add v3.4s, v3.4s, v17.4s
+; CHECK-NEXT: and v21.16b, v21.16b, v2.16b
+; CHECK-NEXT: and v19.16b, v19.16b, v2.16b
+; CHECK-NEXT: and v22.16b, v22.16b, v2.16b
+; CHECK-NEXT: and v17.16b, v17.16b, v2.16b
+; CHECK-NEXT: and v23.16b, v23.16b, v2.16b
+; CHECK-NEXT: and v24.16b, v24.16b, v2.16b
+; CHECK-NEXT: and v18.16b, v18.16b, v2.16b
+; CHECK-NEXT: and v20.16b, v20.16b, v2.16b
+; CHECK-NEXT: add v5.4s, v5.4s, v19.4s
+; CHECK-NEXT: add v3.4s, v3.4s, v21.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v22.4s
+; CHECK-NEXT: add v4.4s, v4.4s, v17.4s
; CHECK-NEXT: add v16.4s, v16.4s, v23.4s
-; CHECK-NEXT: add v5.4s, v5.4s, v24.4s
-; CHECK-NEXT: add v6.4s, v6.4s, v20.4s
-; CHECK-NEXT: add v7.4s, v7.4s, v18.4s
+; CHECK-NEXT: add v6.4s, v6.4s, v24.4s
+; CHECK-NEXT: add v7.4s, v7.4s, v20.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v18.4s
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: // %bb.2: // %middle.block
-; CHECK-NEXT: add v1.4s, v7.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v16.4s, v4.4s
-; CHECK-NEXT: add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT: add v2.4s, v6.4s, v2.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: add v2.4s, v16.4s, v5.4s
+; CHECK-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-NEXT: add v3.4s, v7.4s, v3.4s
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 4f00aed3aa4bc..ddeeca7d5df50 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -31,14 +31,14 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ldr z5, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
-; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
-; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
-; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90
-; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90
+; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
+; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0
+; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90
+; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %exit.block
-; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
-; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
+; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d
; CHECK-NEXT: faddv d0, p0, z2.d
; CHECK-NEXT: faddv d1, p0, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
@@ -205,20 +205,20 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: ldr z18, [x1, #3, mul vl]
; CHECK-NEXT: ldr z19, [x1, #2, mul vl]
; CHECK-NEXT: add x1, x1, x10
-; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0
-; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0
+; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
+; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0
; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0
-; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90
-; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90
+; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90
+; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90
; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90
; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
-; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z5.d, z1.d, z0.d
; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d
-; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z0.d, z1.d, z0.d
; CHECK-NEXT: fadd z1.d, z4.d, z5.d
; CHECK-NEXT: fadd z2.d, z2.d, z0.d
; CHECK-NEXT: faddv d0, p0, z1.d
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
index aed3072bb4af3..355adec955e4b 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
@@ -25,14 +25,14 @@ define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q3, q2, [x9]
; CHECK-NEXT: cmp x8, #1600
; CHECK-NEXT: ldp q5, q4, [x10]
-; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #0
-; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #0
-; CHECK-NEXT: fcmla v0.2d, v5.2d, v3.2d, #90
-; CHECK-NEXT: fcmla v1.2d, v4.2d, v2.2d, #90
+; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #0
+; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #0
+; CHECK-NEXT: fcmla v1.2d, v5.2d, v3.2d, #90
+; CHECK-NEXT: fcmla v0.2d, v4.2d, v2.2d, #90
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %middle.block
-; CHECK-NEXT: zip2 v2.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip2 v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: faddp d1, v2.2d
; CHECK-NEXT: ret
@@ -159,20 +159,20 @@ define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: ldp q17, q16, [x8], #64
; CHECK-NEXT: ldp q19, q18, [x9], #64
; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0
-; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0
-; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0
+; CHECK-NEXT: fcmla v1.2d, v6.2d, v4.2d, #0
+; CHECK-NEXT: fcmla v0.2d, v19.2d, v17.2d, #0
; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #0
; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #90
-; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #90
-; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #90
+; CHECK-NEXT: fcmla v1.2d, v6.2d, v4.2d, #90
+; CHECK-NEXT: fcmla v0.2d, v19.2d, v17.2d, #90
; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #90
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: // %bb.2: // %middle.block
-; CHECK-NEXT: zip2 v4.2d, v1.2d, v3.2d
-; CHECK-NEXT: zip1 v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: zip2 v3.2d, v2.2d, v0.2d
-; CHECK-NEXT: zip1 v0.2d, v2.2d, v0.2d
-; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: zip2 v4.2d, v0.2d, v3.2d
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v3.2d
+; CHECK-NEXT: zip2 v3.2d, v2.2d, v1.2d
+; CHECK-NEXT: zip1 v1.2d, v2.2d, v1.2d
+; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT: fadd v1.2d, v4.2d, v3.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: faddp d1, v1.2d
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
index 338084295fc7f..0fe4683d97a23 100644
--- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
+++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
@@ -16,8 +16,9 @@ define i32 @test(ptr %ptr) {
; CHECK-NEXT: mov w9, wzr
; CHECK-NEXT: LBB0_1: ; %.thread
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lsr w11, w9, #1
; CHECK-NEXT: sub w10, w9, #1
-; CHECK-NEXT: lsr w9, w9, #1
+; CHECK-NEXT: mov w9, w11
; CHECK-NEXT: tbnz w10, #0, LBB0_1
; CHECK-NEXT: ; %bb.2: ; %bb343
; CHECK-NEXT: and w9, w10, #0x1
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
index 52a77cb396909..6c6a691760af3 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2(<vscale x 16 x float> %arg) {
define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) {
; CHECK-LABEL: extract_v4i1_nxv32i1_0:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
-; CHECK-NEXT: umov w8, v0.b[1]
-; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT: umov w8, v1.b[1]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: umov w9, v1.b[2]
; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: umov w8, v1.b[2]
-; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: umov w8, v1.b[3]
+; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 0)
ret <4 x i1> %ext
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 72994100b2970..1cefe96962e29 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) {
; CHECK-LABEL: extract_v4i1_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov w9, v1.s[2]
; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[2]
-; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1(<vscale x 4 x i1> %inmask, i64 0)
ret <4 x i1> %mask
@@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) {
define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) {
; CHECK-LABEL: extract_v8i1_nxv8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1
+; CHECK-NEXT: umov w8, v1.h[1]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: umov w9, v1.h[2]
; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v0.b[2], w8
; CHECK-NEXT: umov w8, v1.h[3]
+; CHECK-NEXT: mov v0.b[2], w9
+; CHECK-NEXT: umov w9, v1.h[4]
; CHECK-NEXT: mov v0.b[3], w8
-; CHECK-NEXT: umov w8, v1.h[4]
-; CHECK-NEXT: mov v0.b[4], w8
; CHECK-NEXT: umov w8, v1.h[5]
+; CHECK-NEXT: mov v0.b[4], w9
+; CHECK-NEXT: umov w9, v1.h[6]
; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: umov w8, v1.h[6]
-; CHECK-NEXT: mov v0.b[6], w8
; CHECK-NEXT: umov w8, v1.h[7]
+; CHECK-NEXT: mov v0.b[6], w9
; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> %inmask, i64 0)
ret <8 x i1> %mask
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
index 8e807cda7166d..41e4a38fad90b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
@@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu"
define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov w9, v1.s[2]
; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, v1.s[2]
-; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%el0 = extractelement <vscale x 4 x i1> %a, i32 0
%el1 = extractelement <vscale x 4 x i1> %a, i32 1
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 935189dec48ac..74a717f1635a3 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: .LBB24_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16
-; CHECK-BE-NEXT: mov x8, x0
+; CHECK-BE-NEXT: add x8, x0, #16
; CHECK-BE-NEXT: ld1 { v1.8h }, [x0]
-; CHECK-BE-NEXT: add x0, x0, #16
-; CHECK-BE-NEXT: add x9, x8, #48
-; CHECK-BE-NEXT: ld1 { v3.8h }, [x0]
+; CHECK-BE-NEXT: ld1 { v3.8h }, [x8]
+; CHECK-BE-NEXT: add x9, x0, #48
+; CHECK-BE-NEXT: add x10, x0, #32
; CHECK-BE-NEXT: subs w2, w2, #1
; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0
@@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h
; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h
; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h
-; CHECK-BE-NEXT: st1 { v4.4s }, [x8]
-; CHECK-BE-NEXT: add x8, x8, #32
+; CHECK-BE-NEXT: st1 { v4.4s }, [x0]
+; CHECK-BE-NEXT: mov x0, x8
; CHECK-BE-NEXT: st1 { v5.4s }, [x9]
-; CHECK-BE-NEXT: st1 { v0.4s }, [x8]
-; CHECK-BE-NEXT: st1 { v1.4s }, [x0]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
; CHECK-BE-NEXT: b.ne .LBB24_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: mov w0, wzr
@@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: .LBB25_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v4.16b }, [x0]
-; CHECK-BE-NEXT: add x10, x1, #48
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: add x8, x1, #32
+; CHECK-BE-NEXT: ld1 { v18.4s }, [x9]
; CHECK-BE-NEXT: ld1 { v16.4s }, [x1]
-; CHECK-BE-NEXT: add x9, x1, #32
-; CHECK-BE-NEXT: ld1 { v18.4s }, [x10]
; CHECK-BE-NEXT: add x1, x1, #16
-; CHECK-BE-NEXT: ld1 { v20.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v20.4s }, [x8]
; CHECK-BE-NEXT: ld1 { v22.4s }, [x1]
-; CHECK-BE-NEXT: add x9, x0, #96
+; CHECK-BE-NEXT: add x8, x0, #96
; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b
; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b
; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b
; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b
; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8
-; CHECK-BE-NEXT: mov x8, x0
+; CHECK-BE-NEXT: add x9, x0, #32
; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8
-; CHECK-BE-NEXT: add x10, x0, #32
+; CHECK-BE-NEXT: add x10, x0, #16
; CHECK-BE-NEXT: subs w2, w2, #1
; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT: rev32 v5.8b, v5.8b
; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8
+; CHECK-BE-NEXT: rev32 v5.8b, v5.8b
; CHECK-BE-NEXT: rev32 v21.8b, v7.8b
; CHECK-BE-NEXT: rev32 v23.8b, v4.8b
; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8
@@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: rev32 v4.8b, v4.8b
; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s
; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s
-; CHECK-BE-NEXT: st1 { v5.2d }, [x9]
+; CHECK-BE-NEXT: st1 { v5.2d }, [x8]
; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s
; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s
-; CHECK-BE-NEXT: add x9, x0, #112
+; CHECK-BE-NEXT: add x8, x0, #112
; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s
-; CHECK-BE-NEXT: st1 { v18.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x0, #80
+; CHECK-BE-NEXT: st1 { v18.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x0, #80
; CHECK-BE-NEXT: st1 { v22.2d }, [x0]
-; CHECK-BE-NEXT: add x0, x0, #64
-; CHECK-BE-NEXT: st1 { v17.2d }, [x9]
-; CHECK-BE-NEXT: add x9, x8, #48
-; CHECK-BE-NEXT: add x8, x8, #16
-; CHECK-BE-NEXT: st1 { v19.2d }, [x10]
-; CHECK-BE-NEXT: st1 { v5.2d }, [x0]
+; CHECK-BE-NEXT: st1 { v17.2d }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #64
+; CHECK-BE-NEXT: st1 { v19.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x0, #48
+; CHECK-BE-NEXT: mov x0, x8
+; CHECK-BE-NEXT: st1 { v5.2d }, [x8]
; CHECK-BE-NEXT: st1 { v6.2d }, [x9]
-; CHECK-BE-NEXT: st1 { v4.2d }, [x8]
+; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
; CHECK-BE-NEXT: b.ne .LBB25_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: mov w0, wzr
@@ -3093,14 +3093,13 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: .LBB26_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16
-; CHECK-BE-NEXT: mov x8, x0
-; CHECK-BE-NEXT: add x9, x0, #32
+; CHECK-BE-NEXT: add x8, x0, #32
; CHECK-BE-NEXT: ld1 { v16.4s }, [x0]
-; CHECK-BE-NEXT: add x10, x0, #48
-; CHECK-BE-NEXT: add x0, x0, #16
-; CHECK-BE-NEXT: ld1 { v17.4s }, [x9]
-; CHECK-BE-NEXT: ld1 { v18.4s }, [x10]
-; CHECK-BE-NEXT: ld1 { v19.4s }, [x0]
+; CHECK-BE-NEXT: add x9, x0, #48
+; CHECK-BE-NEXT: add x10, x0, #16
+; CHECK-BE-NEXT: ld1 { v17.4s }, [x8]
+; CHECK-BE-NEXT: ld1 { v18.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v19.4s }, [x10]
; CHECK-BE-NEXT: subs w2, w2, #1
; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b
; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b
@@ -3114,10 +3113,11 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s
; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s
; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s
-; CHECK-BE-NEXT: st1 { v5.4s }, [x8]
-; CHECK-BE-NEXT: st1 { v6.4s }, [x9]
-; CHECK-BE-NEXT: st1 { v7.4s }, [x10]
-; CHECK-BE-NEXT: st1 { v4.4s }, [x0]
+; CHECK-BE-NEXT: st1 { v5.4s }, [x0]
+; CHECK-BE-NEXT: mov x0, x10
+; CHECK-BE-NEXT: st1 { v6.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v7.4s }, [x9]
+; CHECK-BE-NEXT: st1 { v4.4s }, [x10]
; CHECK-BE-NEXT: b.ne .LBB26_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: mov w0, wzr
@@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: .LBB28_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16
-; CHECK-BE-NEXT: mov x8, x0
+; CHECK-BE-NEXT: add x8, x0, #16
; CHECK-BE-NEXT: ld1 { v1.8h }, [x0]
-; CHECK-BE-NEXT: add x0, x0, #16
-; CHECK-BE-NEXT: add x9, x8, #48
-; CHECK-BE-NEXT: ld1 { v3.8h }, [x0]
+; CHECK-BE-NEXT: ld1 { v3.8h }, [x8]
+; CHECK-BE-NEXT: add x9, x0, #48
+; CHECK-BE-NEXT: add x10, x0, #32
; CHECK-BE-NEXT: subs w2, w2, #1
; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0
@@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h
; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h
; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h
-; CHECK-BE-NEXT: st1 { v4.4s }, [x8]
-; CHECK-BE-NEXT: add x8, x8, #32
+; CHECK-BE-NEXT: st1 { v4.4s }, [x0]
+; CHECK-BE-NEXT: mov x0, x8
; CHECK-BE-NEXT: st1 { v5.4s }, [x9]
-; CHECK-BE-NEXT: st1 { v0.4s }, [x8]
-; CHECK-BE-NEXT: st1 { v1.4s }, [x0]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
; CHECK-BE-NEXT: b.ne .LBB28_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: mov w0, wzr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index c1e6b4fffa82d..8372d22b72afc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -21,14 +21,14 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX10-NEXT: s_mov_b32 s8, exec_lo
-; GFX10-NEXT: s_mov_b32 s9, s5
; GFX10-NEXT: s_add_i32 s6, s6, 1
-; GFX10-NEXT: s_xor_b32 s5, s5, s8
+; GFX10-NEXT: s_xor_b32 s8, s5, s8
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
-; GFX10-NEXT: s_and_b32 s8, exec_lo, s9
-; GFX10-NEXT: s_or_b32 s7, s7, s8
+; GFX10-NEXT: s_and_b32 s9, exec_lo, s5
+; GFX10-NEXT: s_mov_b32 s5, s8
+; GFX10-NEXT: s_or_b32 s7, s7, s9
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %exit
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 9a90faf723461..7bd1ff2201977 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -78,13 +78,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3]
-; GFX11-NEXT: global_load_b32 v5, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v6, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v3, v6, v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -127,14 +126,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v5, v1, s[2:3]
+; GFX11-NEXT: global_load_b32 v6, v1, s[2:3]
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -224,14 +222,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v6, v0, s[2:3]
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v4
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -523,28 +520,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
-; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3]
+; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5]
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4]
+; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX11-NEXT: s_cbranch_execz .LBB10_2
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5]
-; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: .LBB10_2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2
; GFX11-NEXT: s_cbranch_execz .LBB10_4
; GFX11-NEXT: ; %bb.3: ; %if
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mul_lo_u32 v1, v3, v6
+; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: .LBB10_4: ; %endif
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 3eecaccf0308f..43ff9ee1ad27d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -3131,8 +3131,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6]
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
@@ -3143,8 +3143,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4
-; GFX8-NEXT: v_mov_b32_e32 v5, v3
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6]
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -3155,8 +3155,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v3, v4
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
@@ -3176,8 +3176,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4
-; GFX11-NEXT: v_mov_b32_e32 v5, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4]
+; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 88e3c86c791de..50e28a7245db8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2147,12 +2147,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3]
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_mov_b32_e32 v0, v4
-; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1]
-; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
+; GFX1164-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_uniform:
@@ -2190,12 +2190,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3]
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_mov_b32_e32 v0, v4
-; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1]
-; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
+; GFX1132-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: add_i64_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 0a098eb6582c7..a9938f17dacb7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1889,13 +1889,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_mov_b32_e32 v0, v4
-; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1]
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_uniform:
@@ -1926,13 +1926,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5]
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_mov_b32_e32 v0, v4
-; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1]
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0
+; GFX1132-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
entry:
%old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index b6eaaf1369ab4..d499b3d5576d7 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -326,12 +326,12 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB2_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -347,12 +347,12 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB2_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -440,12 +440,12 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB3_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -462,12 +462,12 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB3_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -880,14 +880,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4]
+; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4]
+; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -914,14 +913,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
+; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4]
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -938,14 +936,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
+; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB6_1
@@ -970,13 +968,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
+; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -992,13 +990,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1014,13 +1012,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB6_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1036,13 +1034,13 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
+; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v2, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB6_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1065,14 +1063,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[1:2], 4.0, v[3:4]
+; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528
+; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1099,14 +1096,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
+; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1123,14 +1119,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
+; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
+; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB7_1
@@ -1155,13 +1151,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
+; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
+; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1177,13 +1173,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
+; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1199,13 +1195,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], 4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
+; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB7_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1222,13 +1218,13 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], 4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], 4.0
+; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB7_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1333,30 +1329,30 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX942-NEXT: ds_read_b32 v2, v1
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX942-NEXT: ds_read_b32 v3, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_and_b32_e32 v0, 24, v3
-; GFX942-NEXT: v_lshlrev_b32_e64 v3, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v3, v3
+; GFX942-NEXT: v_and_b32_e32 v0, 24, v2
+; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0
+; GFX942-NEXT: v_not_b32_e32 v2, v2
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16:
@@ -1467,30 +1463,30 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX90A-NEXT: ds_read_b32 v2, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX90A-NEXT: ds_read_b32 v3, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_and_b32_e32 v0, 24, v3
-; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2
+; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; GFX90A-NEXT: v_not_b32_e32 v2, v2
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fadd_ret_f16:
@@ -1721,30 +1717,30 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_add_u32_e32 v0, 0xfffe, v0
; GFX942-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX942-NEXT: ds_read_b32 v2, v1
+; GFX942-NEXT: ds_read_b32 v3, v1
; GFX942-NEXT: v_and_b32_e32 v0, 3, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v3, v0, s0
-; GFX942-NEXT: v_not_b32_e32 v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e64 v2, v0, s0
+; GFX942-NEXT: v_not_b32_e32 v2, v2
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset:
@@ -1861,30 +1857,30 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0
; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX90A-NEXT: ds_read_b32 v2, v1
+; GFX90A-NEXT: ds_read_b32 v3, v1
; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4
-; GFX90A-NEXT: v_not_b32_e32 v3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4
+; GFX90A-NEXT: v_not_b32_e32 v2, v2
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fadd_ret_f16__offset:
@@ -2036,27 +2032,27 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2077,28 +2073,28 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2123,15 +2119,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2144,27 +2140,27 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2179,28 +2175,28 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2215,23 +2211,23 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v3, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2253,15 +2249,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2282,15 +2278,15 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2312,16 +2308,16 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2342,18 +2338,18 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2374,18 +2370,18 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2416,19 +2412,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2459,20 +2455,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2498,15 +2493,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX942-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX942-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2529,19 +2524,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2566,20 +2561,19 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2602,16 +2596,16 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX10-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX10-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -2634,15 +2628,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX90A-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX90A-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2664,15 +2658,15 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX908-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX908-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2695,16 +2689,16 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX8-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX8-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2726,18 +2720,18 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2759,18 +2753,18 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2852,19 +2846,19 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-LABEL: local_atomic_fadd_ret_f16__offset__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_add_f16_e32 v1, 4.0, v2
; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2955,19 +2949,19 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX90A-LABEL: local_atomic_fadd_ret_f16__offset__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_add_f16_e32 v1, 4.0, v2
; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3092,16 +3086,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3124,17 +3118,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -3154,13 +3147,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX942-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3175,16 +3168,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3201,17 +3194,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3228,15 +3220,15 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -3253,13 +3245,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX90A-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3275,13 +3267,13 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX908-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3297,14 +3289,14 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX8-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3320,16 +3312,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3346,16 +3338,16 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3491,27 +3483,27 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16:
@@ -3658,25 +3650,25 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fadd_ret_bf16:
@@ -3950,27 +3942,27 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset:
@@ -4123,25 +4115,25 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fadd_ret_bf16__offset:
@@ -4305,38 +4297,38 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4357,37 +4349,37 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4413,22 +4405,22 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4441,38 +4433,38 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4487,37 +4479,37 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4532,28 +4524,28 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v3, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
@@ -4576,20 +4568,20 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4611,20 +4603,20 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4646,22 +4638,22 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4682,18 +4674,18 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4714,18 +4706,18 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4756,30 +4748,29 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4810,29 +4801,28 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4859,22 +4849,22 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB17_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4898,29 +4888,28 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4946,28 +4935,27 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4990,21 +4978,21 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -5028,20 +5016,20 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5064,20 +5052,20 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5100,22 +5088,22 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5137,18 +5125,18 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5170,18 +5158,18 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5283,14 +5271,13 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX942-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -5305,6 +5292,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5421,14 +5409,13 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -5442,6 +5429,7 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5581,27 +5569,26 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -5624,26 +5611,25 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -5661,24 +5647,24 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
-; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX942-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX942-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB19_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5694,26 +5680,25 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5731,25 +5716,24 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5766,21 +5750,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -5798,20 +5782,20 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5828,20 +5812,20 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5857,21 +5841,21 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5887,16 +5871,16 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5913,16 +5897,16 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6010,17 +5994,17 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX90A-LABEL: local_atomic_fadd_ret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6234,17 +6218,17 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX90A-LABEL: local_atomic_fadd_ret_v2f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6415,14 +6399,13 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v2, v3, v1
+; GFX11-NEXT: v_pk_add_f16 v3, v2, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6439,13 +6422,13 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_add_f16 v2, v3, v1
+; GFX10-NEXT: v_pk_add_f16 v3, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -6461,12 +6444,12 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6481,12 +6464,12 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: v_pk_add_f16 v2, v3, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX908-NEXT: v_pk_add_f16 v3, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6502,14 +6485,14 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v4, v3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6631,14 +6614,13 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v2, v3, v1
+; GFX11-NEXT: v_pk_add_f16 v3, v2, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6655,13 +6637,13 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_add_f16 v2, v3, v1
+; GFX10-NEXT: v_pk_add_f16 v3, v2, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -6677,12 +6659,12 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6697,12 +6679,12 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: v_pk_add_f16 v2, v3, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX908-NEXT: v_pk_add_f16 v3, v2, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6718,14 +6700,14 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v4, v3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6980,40 +6962,40 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-LABEL: local_atomic_fadd_ret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fadd_ret_v2bf16:
@@ -7334,40 +7316,40 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-LABEL: local_atomic_fadd_ret_v2bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fadd_ret_v2bf16__offset:
@@ -7565,32 +7547,30 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7611,32 +7591,30 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7656,27 +7634,27 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v2
; GFX10-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -7696,26 +7674,26 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7734,26 +7712,26 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v4, v4, v2
; GFX908-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7771,29 +7749,29 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7910,32 +7888,30 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7956,32 +7932,30 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -8001,27 +7975,27 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v2
; GFX10-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -8041,26 +8015,26 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8079,26 +8053,26 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v4, v4, v2
; GFX908-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8116,29 +8090,29 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8875,20 +8849,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX7-NEXT: ; %bb.5:
; GFX7-NEXT: s_lshl_b32 s0, s3, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: ds_read_b32 v2, v1
+; GFX7-NEXT: ds_read_b32 v3, v1
; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
-; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v4, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_6
; GFX7-NEXT: .LBB28_7: ; %Flow21
@@ -8999,20 +8973,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX6-NEXT: ; %bb.5:
; GFX6-NEXT: s_lshl_b32 s0, s3, 4
; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: ds_read_b32 v2, v1
+; GFX6-NEXT: ds_read_b32 v3, v1
; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
-; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v4, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_6
; GFX6-NEXT: .LBB28_7: ; %Flow19
@@ -9703,20 +9677,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX7-NEXT: ; %bb.5:
; GFX7-NEXT: s_lshl_b32 s0, s3, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: ds_read_b32 v2, v1
+; GFX7-NEXT: ds_read_b32 v3, v1
; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
-; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
+; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v4, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3
; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB29_6
; GFX7-NEXT: .LBB29_7: ; %Flow21
@@ -9827,20 +9801,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX6-NEXT: ; %bb.5:
; GFX6-NEXT: s_lshl_b32 s0, s3, 4
; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: ds_read_b32 v2, v1
+; GFX6-NEXT: ds_read_b32 v3, v1
; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0
-; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
+; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v4, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX6-NEXT: v_add_f32_e32 v4, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3
; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB29_6
; GFX6-NEXT: .LBB29_7: ; %Flow19
@@ -10110,12 +10084,12 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10131,12 +10105,12 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB31_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 8e094a7269a49..282c754473047 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -886,21 +886,21 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16:
@@ -1025,21 +1025,21 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f16:
@@ -1285,21 +1285,21 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset:
@@ -1430,21 +1430,21 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f16__offset:
@@ -1598,29 +1598,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -1641,29 +1641,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -1688,16 +1688,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1710,29 +1710,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1747,29 +1747,29 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1784,24 +1784,24 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v3, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -1823,16 +1823,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1853,16 +1853,16 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX908-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1884,17 +1884,17 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1915,18 +1915,18 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1947,18 +1947,18 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1989,21 +1989,20 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2034,21 +2033,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2074,16 +2073,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX942-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2106,21 +2105,20 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2145,21 +2143,21 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2182,17 +2180,17 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -2215,16 +2213,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX90A-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2246,16 +2244,16 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX908-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2278,17 +2276,17 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2310,18 +2308,18 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2343,18 +2341,18 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2439,13 +2437,12 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-LABEL: local_atomic_fmax_ret_f16__offset__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_max_f16_e32 v1, v2, v2
; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v1
; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
@@ -2453,6 +2450,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2547,13 +2545,12 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX90A-LABEL: local_atomic_fmax_ret_f16__offset__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2
; GFX90A-NEXT: v_max_f16_e32 v1, 4.0, v1
; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
@@ -2561,6 +2558,7 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2687,18 +2685,17 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2721,18 +2718,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2752,14 +2749,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v1
-; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX942-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX942-NEXT: v_max_f16_e32 v2, 4.0, v2
+; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2774,18 +2771,17 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2802,18 +2798,18 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, 4.0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2830,16 +2826,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX10-NEXT: v_max_f16_e32 v1, 4.0, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX10-NEXT: v_max_f16_e32 v2, 4.0, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -2856,14 +2852,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX90A-NEXT: v_max_f16_e32 v1, 4.0, v1
-; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX90A-NEXT: v_max_f16_e32 v2, 4.0, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2879,14 +2875,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX908-NEXT: v_max_f16_e32 v1, 4.0, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX908-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX908-NEXT: v_max_f16_e32 v2, 4.0, v2
+; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2902,15 +2898,15 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_max_f16_e32 v1, 4.0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX8-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_max_f16_e32 v2, 4.0, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2926,16 +2922,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2952,16 +2948,16 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3097,27 +3093,27 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16:
@@ -3264,25 +3260,25 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_bf16:
@@ -3558,27 +3554,27 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset:
@@ -3731,25 +3727,25 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_bf16__offset:
@@ -3915,38 +3911,38 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3967,37 +3963,37 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4023,22 +4019,22 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4051,38 +4047,38 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4097,37 +4093,37 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4142,28 +4138,28 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v3, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
@@ -4186,20 +4182,20 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4221,20 +4217,20 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4256,22 +4252,22 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4292,19 +4288,19 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4325,19 +4321,19 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4368,30 +4364,29 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4422,29 +4417,28 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4471,22 +4465,22 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB17_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4510,29 +4504,28 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4558,28 +4551,27 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4602,21 +4594,21 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -4640,20 +4632,20 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4676,20 +4668,20 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4712,22 +4704,22 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4749,19 +4741,19 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4783,19 +4775,19 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4897,14 +4889,13 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v1
; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -4919,6 +4910,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5035,14 +5027,13 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1
; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -5056,6 +5047,7 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5197,27 +5189,26 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -5240,26 +5231,25 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -5280,21 +5270,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX942-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX942-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB19_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5310,26 +5300,25 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5347,25 +5336,24 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5382,21 +5370,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -5414,20 +5402,20 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5444,20 +5432,20 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5473,21 +5461,21 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5503,17 +5491,17 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5530,17 +5518,17 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5592,25 +5580,25 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX942-LABEL: local_atomic_fmax_ret_v2f16:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0
+; GFX942-NEXT: ds_read_b32 v3, v0
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX942-NEXT: v_pk_max_f16 v1, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB20_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_v2f16:
@@ -5668,24 +5656,24 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX90A-LABEL: local_atomic_fmax_ret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_v2f16:
@@ -5864,25 +5852,25 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX942-LABEL: local_atomic_fmax_ret_v2f16__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX942-NEXT: v_pk_max_f16 v1, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX942-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_v2f16__offset:
@@ -5940,24 +5928,24 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX90A-LABEL: local_atomic_fmax_ret_v2f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_v2f16__offset:
@@ -6113,15 +6101,15 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
-; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
+; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6141,14 +6129,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX942-NEXT: v_pk_max_f16 v3, v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB22_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6164,15 +6152,15 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX11-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6190,14 +6178,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX10-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -6214,13 +6202,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6236,13 +6224,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX908-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6260,16 +6248,16 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6375,15 +6363,15 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
-; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
+; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6403,14 +6391,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX942-NEXT: v_pk_max_f16 v3, v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX942-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB23_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6426,15 +6414,15 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX11-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6452,14 +6440,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX10-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -6476,13 +6464,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6498,13 +6486,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX908-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX908-NEXT: v_pk_max_f16 v3, v3, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6522,16 +6510,16 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_max_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
+; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_max_f16_e32 v5, v5, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6738,41 +6726,41 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX942-LABEL: local_atomic_fmax_ret_v2bf16:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0
+; GFX942-NEXT: ds_read_b32 v3, v0
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_max_f32_e32 v2, v2, v3
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX942-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB24_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16:
@@ -6910,40 +6898,40 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-LABEL: local_atomic_fmax_ret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_v2bf16:
@@ -7216,41 +7204,41 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX942-LABEL: local_atomic_fmax_ret_v2bf16__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_max_f32_e32 v2, v2, v3
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX942-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB25_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16__offset:
@@ -7388,40 +7376,40 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-LABEL: local_atomic_fmax_ret_v2bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_v2bf16__offset:
@@ -7601,34 +7589,31 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -7653,33 +7638,32 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -7702,27 +7686,27 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v4, v4, v2
; GFX942-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7740,32 +7724,30 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7786,32 +7768,30 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7831,27 +7811,27 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v2
; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -7871,26 +7851,26 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7909,26 +7889,26 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v4, v4, v2
; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7946,29 +7926,29 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8067,34 +8047,31 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8119,33 +8096,32 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8168,27 +8144,27 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v4, v4, v2
; GFX942-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB27_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8206,32 +8182,30 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8252,32 +8226,30 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -8297,27 +8269,27 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v2
; GFX10-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -8337,26 +8309,26 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8375,26 +8347,26 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_max_f32_e32 v4, v4, v2
; GFX908-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8412,29 +8384,29 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 0aa8d33ea7429..4a6428eb338ff 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -886,21 +886,21 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16:
@@ -1025,21 +1025,21 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_f16:
@@ -1285,21 +1285,21 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset:
@@ -1430,21 +1430,21 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_f16__offset:
@@ -1598,29 +1598,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -1641,29 +1641,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -1688,16 +1688,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1710,29 +1710,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1747,29 +1747,29 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1784,24 +1784,24 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v3, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -1823,16 +1823,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1853,16 +1853,16 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1884,17 +1884,17 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1915,18 +1915,18 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1947,18 +1947,18 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1989,21 +1989,20 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2034,21 +2033,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2074,16 +2073,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX942-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2106,21 +2105,20 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2145,21 +2143,21 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2182,17 +2180,17 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX10-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX10-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX10-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -2215,16 +2213,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX90A-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2246,16 +2244,16 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX908-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX908-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX908-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2278,17 +2276,17 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2310,18 +2308,18 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2343,18 +2341,18 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2439,13 +2437,12 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-LABEL: local_atomic_fmin_ret_f16__offset__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_max_f16_e32 v1, v2, v2
; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1
; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
@@ -2453,6 +2450,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2547,13 +2545,12 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX90A-LABEL: local_atomic_fmin_ret_f16__offset__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2
; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1
; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
@@ -2561,6 +2558,7 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2687,18 +2685,17 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
-; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2721,18 +2718,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2
-; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2752,14 +2749,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v1
-; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX942-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v2
+; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2774,18 +2771,17 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
-; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2802,18 +2798,18 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2830,16 +2826,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX10-NEXT: v_min_f16_e32 v1, 4.0, v1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX10-NEXT: v_min_f16_e32 v2, 4.0, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -2856,14 +2852,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX90A-NEXT: v_min_f16_e32 v1, 4.0, v1
-; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX90A-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX90A-NEXT: v_min_f16_e32 v2, 4.0, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2879,14 +2875,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX908-NEXT: v_min_f16_e32 v1, 4.0, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX908-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX908-NEXT: v_min_f16_e32 v2, 4.0, v2
+; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2902,15 +2898,15 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX8-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_min_f16_e32 v2, 4.0, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2926,16 +2922,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2952,16 +2948,16 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3097,27 +3093,27 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16:
@@ -3264,25 +3260,25 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_bf16:
@@ -3558,27 +3554,27 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset:
@@ -3731,25 +3727,25 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_bf16__offset:
@@ -3915,38 +3911,38 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3967,37 +3963,37 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4023,22 +4019,22 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4051,38 +4047,38 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4097,37 +4093,37 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4142,28 +4138,28 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v3, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
@@ -4186,20 +4182,20 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4221,20 +4217,20 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4256,22 +4252,22 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4292,19 +4288,19 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4325,19 +4321,19 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4368,30 +4364,29 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4422,29 +4417,28 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4471,22 +4465,22 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB17_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4510,29 +4504,28 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4558,28 +4551,27 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4602,21 +4594,21 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -4640,20 +4632,20 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4676,20 +4668,20 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4712,22 +4704,22 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4749,19 +4741,19 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4783,19 +4775,19 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4897,14 +4889,13 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v1
; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -4919,6 +4910,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5035,14 +5027,13 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1
; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -5056,6 +5047,7 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5197,27 +5189,26 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -5240,26 +5231,25 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -5280,21 +5270,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX942-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX942-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB19_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5310,26 +5300,25 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5347,25 +5336,24 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5382,21 +5370,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -5414,20 +5402,20 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5444,20 +5432,20 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5473,21 +5461,21 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5503,17 +5491,17 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5530,17 +5518,17 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5592,25 +5580,25 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX942-LABEL: local_atomic_fmin_ret_v2f16:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0
+; GFX942-NEXT: ds_read_b32 v3, v0
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX942-NEXT: v_pk_max_f16 v1, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v1, v1, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB20_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_ret_v2f16:
@@ -5668,24 +5656,24 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX90A-LABEL: local_atomic_fmin_ret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v1, v1, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_v2f16:
@@ -5864,25 +5852,25 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX942-LABEL: local_atomic_fmin_ret_v2f16__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX942-NEXT: v_pk_max_f16 v1, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX942-NEXT: v_pk_min_f16 v1, v1, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmin_ret_v2f16__offset:
@@ -5940,24 +5928,24 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX90A-LABEL: local_atomic_fmin_ret_v2f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX90A-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v1, v1, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_v2f16__offset:
@@ -6113,15 +6101,15 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
-; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1
+; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6141,14 +6129,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX942-NEXT: v_pk_max_f16 v3, v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v3, v3, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB22_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6164,15 +6152,15 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX11-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6190,14 +6178,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX10-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -6214,13 +6202,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6236,13 +6224,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX908-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6260,16 +6248,16 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_min_f16_e32 v5, v5, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6375,15 +6363,15 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
-; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1
+; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6403,14 +6391,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX942-NEXT: v_pk_max_f16 v3, v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX942-NEXT: v_pk_min_f16 v3, v3, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB23_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6426,15 +6414,15 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX11-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX11-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6452,14 +6440,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX10-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX10-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -6476,13 +6464,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6498,13 +6486,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX908-NEXT: v_pk_min_f16 v2, v2, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX908-NEXT: v_pk_max_f16 v3, v2, v2
+; GFX908-NEXT: v_pk_min_f16 v3, v3, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6522,16 +6510,16 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v4, v4
-; GFX8-NEXT: v_min_f16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v3, v3
+; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_min_f16_e32 v5, v5, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6738,41 +6726,41 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX942-LABEL: local_atomic_fmin_ret_v2bf16:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0
+; GFX942-NEXT: ds_read_b32 v3, v0
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_min_f32_e32 v2, v2, v3
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX942-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB24_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16:
@@ -6910,40 +6898,40 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-LABEL: local_atomic_fmin_ret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_v2bf16:
@@ -7216,41 +7204,41 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX942-LABEL: local_atomic_fmin_ret_v2bf16__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_min_f32_e32 v2, v2, v3
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX942-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB25_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16__offset:
@@ -7388,40 +7376,40 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-LABEL: local_atomic_fmin_ret_v2bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmin_ret_v2bf16__offset:
@@ -7601,34 +7589,31 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -7653,33 +7638,32 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -7702,27 +7686,27 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v4, v4, v2
; GFX942-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7740,32 +7724,30 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7786,32 +7768,30 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -7831,27 +7811,27 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v2
; GFX10-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -7871,26 +7851,26 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7909,26 +7889,26 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v4, v4, v2
; GFX908-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7946,29 +7926,29 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v2
; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8067,34 +8047,31 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8119,33 +8096,32 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8168,27 +8144,27 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v4, v4, v2
; GFX942-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB27_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8206,32 +8182,30 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8252,32 +8226,30 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -8297,27 +8269,27 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v2
; GFX10-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -8337,26 +8309,26 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8375,26 +8347,26 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_min_f32_e32 v4, v4, v2
; GFX908-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8412,29 +8384,29 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v2
; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 929bb61ddabcf..7f95e7df3f6b5 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -50,17 +50,17 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX942-LABEL: local_atomic_fsub_ret_f32:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0
+; GFX942-NEXT: ds_read_b32 v2, v0
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -119,17 +119,17 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX90A-LABEL: local_atomic_fsub_ret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
+; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -262,17 +262,17 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-LABEL: local_atomic_fsub_ret_f32__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65532
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -331,17 +331,17 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-LABEL: local_atomic_fsub_ret_f32__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -453,14 +453,13 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -479,12 +478,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -499,14 +498,13 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -523,13 +521,13 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
@@ -545,12 +543,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -565,12 +563,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -586,12 +584,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -607,12 +605,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB2_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -628,12 +626,12 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB2_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -656,14 +654,13 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -682,12 +679,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB3_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -702,14 +699,13 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -726,13 +722,13 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
@@ -748,12 +744,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -768,12 +764,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -789,12 +785,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB3_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -810,12 +806,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
+; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB3_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -832,12 +828,12 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB3_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -887,18 +883,18 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX942-LABEL: local_atomic_fsub_ret_f64:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: ds_read_b64 v[4:5], v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: ds_read_b64 v[0:1], v0
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX942-NEXT: v_add_f64 v[0:1], v[4:5], -4.0
; GFX942-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB4_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -957,18 +953,18 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX90A-LABEL: local_atomic_fsub_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ds_read_b64 v[4:5], v0
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: ds_read_b64 v[0:1], v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -4.0
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1104,18 +1100,18 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-LABEL: local_atomic_fsub_ret_f64__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: ds_read_b64 v[4:5], v0 offset:65528
; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: ds_read_b64 v[0:1], v0 offset:65528
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX942-NEXT: v_add_f64 v[0:1], v[4:5], -4.0
; GFX942-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB5_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1174,18 +1170,18 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-LABEL: local_atomic_fsub_ret_f64__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ds_read_b64 v[4:5], v0 offset:65528
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: ds_read_b64 v[0:1], v0 offset:65528
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -4.0
; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[4:5], v[0:1] offset:65528
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1300,14 +1296,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4]
+; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4]
+; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1326,12 +1321,12 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
-; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3]
+; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
+; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB6_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1346,14 +1341,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4]
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1370,14 +1364,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB6_1
@@ -1393,12 +1387,12 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3]
+; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1413,13 +1407,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1435,13 +1429,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1457,13 +1451,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB6_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1479,13 +1473,13 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2]
+; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v2, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB6_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1508,14 +1502,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[1:2], -4.0, v[3:4]
+; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528
+; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1534,12 +1527,12 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
-; GFX942-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528
+; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
+; GFX942-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB7_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1554,14 +1547,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX11-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b64 v[1:2], v0, v[1:2], v[3:4] offset:65528
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -1578,14 +1570,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
+; GFX10-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
+; GFX10-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[1:2], v[3:4]
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v1, v3
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB7_1
@@ -1601,12 +1593,12 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v0, v[4:5], v[2:3] offset:65528
+; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], -4.0
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v0, v[2:3], v[4:5] offset:65528
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1621,13 +1613,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
-; GFX908-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
+; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX908-NEXT: v_mov_b32_e32 v1, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1643,13 +1635,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
-; GFX8-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
+; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1665,13 +1657,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_add_f64 v[1:2], v[3:4], -4.0
-; GFX7-NEXT: ds_cmpst_rtn_b64 v[1:2], v0, v[3:4], v[1:2] offset:65528
+; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], -4.0
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] offset:65528
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[3:4]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB7_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1688,13 +1680,13 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], -4.0
-; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX6-NEXT: v_add_f64 v[3:4], v[0:1], -4.0
+; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v2, v[0:1], v[3:4]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB7_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1799,30 +1791,30 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX942-NEXT: ds_read_b32 v2, v1
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX942-NEXT: ds_read_b32 v3, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_and_b32_e32 v0, 24, v3
-; GFX942-NEXT: v_lshlrev_b32_e64 v3, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v3, v3
+; GFX942-NEXT: v_and_b32_e32 v0, 24, v2
+; GFX942-NEXT: v_lshlrev_b32_e64 v2, v2, s0
+; GFX942-NEXT: v_not_b32_e32 v2, v2
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB8_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16:
@@ -1933,30 +1925,30 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX90A-NEXT: ds_read_b32 v2, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX90A-NEXT: ds_read_b32 v3, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_and_b32_e32 v0, 24, v3
-; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2
+; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4
+; GFX90A-NEXT: v_not_b32_e32 v2, v2
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fsub_ret_f16:
@@ -2187,30 +2179,30 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_add_u32_e32 v0, 0xfffe, v0
; GFX942-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX942-NEXT: ds_read_b32 v2, v1
+; GFX942-NEXT: ds_read_b32 v3, v1
; GFX942-NEXT: v_and_b32_e32 v0, 3, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v3, v0, s0
-; GFX942-NEXT: v_not_b32_e32 v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e64 v2, v0, s0
+; GFX942-NEXT: v_not_b32_e32 v2, v2
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX942-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB9_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset:
@@ -2327,30 +2319,30 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_u32_e32 v0, 0xfffe, v0
; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX90A-NEXT: ds_read_b32 v2, v1
+; GFX90A-NEXT: ds_read_b32 v3, v1
; GFX90A-NEXT: v_and_b32_e32 v0, 3, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v3, v0, s4
-; GFX90A-NEXT: v_not_b32_e32 v3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v0, s4
+; GFX90A-NEXT: v_not_b32_e32 v2, v2
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fsub_ret_f16__offset:
@@ -2502,27 +2494,27 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2543,28 +2535,28 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2589,15 +2581,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB10_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2610,27 +2602,27 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2645,28 +2637,28 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -2681,23 +2673,23 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v3, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2719,15 +2711,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2748,15 +2740,15 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB10_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2778,16 +2770,16 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2808,18 +2800,18 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2840,18 +2832,18 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2882,19 +2874,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -2925,20 +2917,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -2964,15 +2955,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX942-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX942-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2995,19 +2986,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3032,20 +3023,19 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3068,16 +3058,16 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX10-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX10-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
@@ -3100,15 +3090,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX90A-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX90A-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3130,15 +3120,15 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX908-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX908-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB11_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3161,16 +3151,16 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX8-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX8-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3192,18 +3182,18 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3225,18 +3215,18 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3318,19 +3308,19 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-LABEL: local_atomic_fsub_ret_f16__offset__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_add_f16_e32 v1, -4.0, v2
; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3421,19 +3411,19 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX90A-LABEL: local_atomic_fsub_ret_f16__offset__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_add_f16_e32 v1, -4.0, v2
; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3558,16 +3548,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -3590,17 +3580,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -3620,13 +3609,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX942-NEXT: v_and_or_b32 v1, v2, s2, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX942-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX942-NEXT: v_and_or_b32 v2, v1, s2, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3641,16 +3630,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3667,17 +3656,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3694,15 +3682,15 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -3719,13 +3707,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX90A-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX90A-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX90A-NEXT: v_and_or_b32 v2, v1, s6, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3741,13 +3729,13 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX908-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX908-NEXT: v_and_or_b32 v2, v1, s6, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3763,14 +3751,14 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX8-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3786,16 +3774,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3812,16 +3800,16 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3957,27 +3945,27 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB14_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16:
@@ -4124,25 +4112,25 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fsub_ret_bf16:
@@ -4416,27 +4404,27 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset:
@@ -4589,25 +4577,25 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fsub_ret_bf16__offset:
@@ -4771,38 +4759,38 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -4823,37 +4811,37 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -4879,22 +4867,22 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4907,38 +4895,38 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4953,37 +4941,37 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4998,28 +4986,28 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX10-NEXT: v_and_b32_e32 v1, -4, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ds_read_b32 v3, v1
-; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX10-NEXT: ds_read_b32 v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v2, v2
+; GFX10-NEXT: v_not_b32_e32 v3, v3
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v1, v2, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
@@ -5042,20 +5030,20 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5077,20 +5065,20 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB16_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5112,22 +5100,22 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5148,18 +5136,18 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5180,18 +5168,18 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v0, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5222,30 +5210,29 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -5276,29 +5263,28 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -5325,22 +5311,22 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX942-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v5, v5, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB17_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5364,29 +5350,28 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5412,28 +5397,27 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -5456,21 +5440,21 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -5494,20 +5478,20 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5530,20 +5514,20 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB17_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5566,22 +5550,22 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5603,18 +5587,18 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5636,18 +5620,18 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v3, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5749,14 +5733,13 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX942-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v1
; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -5771,6 +5754,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5887,14 +5871,13 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65534
+; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65534
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v1
; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
@@ -5908,6 +5891,7 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6047,27 +6031,26 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -6090,26 +6073,25 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -6130,21 +6112,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX942-NEXT: v_add3_u32 v3, v3, v1, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX942-NEXT: v_and_or_b32 v1, v2, s3, v1
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v1, s3, v2
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB19_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6160,26 +6142,25 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6197,25 +6178,24 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6232,21 +6212,21 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -6264,20 +6244,20 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX90A-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX90A-NEXT: v_add3_u32 v3, v3, v1, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX90A-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX90A-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6294,20 +6274,20 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT: v_add3_u32 v3, v3, v1, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX908-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: v_and_or_b32 v2, v1, s7, v2
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6322,22 +6302,22 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6353,16 +6333,16 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65534
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65534
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6379,16 +6359,16 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6438,17 +6418,17 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX942-LABEL: local_atomic_fsub_ret_v2f16:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0
+; GFX942-NEXT: ds_read_b32 v3, v0
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB20_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6507,17 +6487,17 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX90A-LABEL: local_atomic_fsub_ret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6693,17 +6673,17 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX942-LABEL: local_atomic_fsub_ret_v2f16__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6762,17 +6742,17 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX90A-LABEL: local_atomic_fsub_ret_v2f16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6926,14 +6906,13 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6952,12 +6931,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB22_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6972,14 +6951,13 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -6996,13 +6974,13 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
@@ -7018,12 +6996,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7038,12 +7016,12 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7059,14 +7037,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7171,14 +7149,13 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -7197,12 +7174,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX942-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB23_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7217,14 +7194,13 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -7241,13 +7217,13 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
@@ -7263,12 +7239,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7283,12 +7259,12 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB23_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7304,14 +7280,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7518,41 +7494,41 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX942-LABEL: local_atomic_fsub_ret_v2bf16:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0
+; GFX942-NEXT: ds_read_b32 v3, v0
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX942-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB24_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16:
@@ -7690,40 +7666,40 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-LABEL: local_atomic_fsub_ret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v3, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fsub_ret_v2bf16:
@@ -7996,41 +7972,41 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX942-LABEL: local_atomic_fsub_ret_v2bf16__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX942-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v1, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v1, v1
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v2, v5, v2, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB25_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16__offset:
@@ -8168,40 +8144,40 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-LABEL: local_atomic_fsub_ret_v2bf16__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v1, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fsub_ret_v2bf16__offset:
@@ -8381,34 +8357,31 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8433,33 +8406,32 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8482,27 +8454,27 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8520,32 +8492,30 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -8566,32 +8536,30 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -8611,27 +8579,27 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
@@ -8651,26 +8619,26 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8689,26 +8657,26 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8726,29 +8694,29 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8847,34 +8815,31 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8899,33 +8864,32 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX12-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
-; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8948,27 +8912,27 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX942-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX942-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX942-NEXT: v_add3_u32 v6, v6, v4, s4
; GFX942-NEXT: v_add3_u32 v8, v8, v5, s4
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1]
-; GFX942-NEXT: v_perm_b32 v3, v5, v3, s5
-; GFX942-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX942-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX942-NEXT: v_perm_b32 v4, v5, v4, s5
+; GFX942-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB27_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8986,32 +8950,30 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v5, v4 offset:65532
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v5, v3 offset:65532
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -9032,32 +8994,30 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0
-; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 offset:65532
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
@@ -9077,27 +9037,27 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX10-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4
-; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4
+; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB27_1
@@ -9117,26 +9077,26 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9155,26 +9115,26 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX908-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8
; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9
-; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB27_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9192,29 +9152,29 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v4, v3 offset:65532
+; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB27_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9339,17 +9299,17 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX942-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v1, v0
+; GFX942-NEXT: ds_read_b32 v2, v0
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9408,17 +9368,17 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX90A-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v1, v0
+; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9529,14 +9489,13 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -9555,12 +9514,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX942-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX942-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB29_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9575,14 +9534,13 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -9599,13 +9557,13 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, -4.0, v2
+; GFX10-NEXT: v_add_f32_e32 v2, -4.0, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB29_1
@@ -9621,12 +9579,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX90A-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9641,12 +9599,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX908-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX908-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB29_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9662,12 +9620,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX8-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB29_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9683,12 +9641,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX7-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9704,12 +9662,12 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_add_f32_e32 v2, -4.0, v1
+; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB29_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index bcece19ae5fdd..5c90957edd9f5 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -16,11 +16,11 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i
; GCN-NEXT: s_cbranch_execz .LBB0_4
; GCN-NEXT: ; %bb.3: ; %.then
; GCN-NEXT: s_or_saveexec_b32 s1, -1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1
-; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s1
-; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v4, -1
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen
diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll
index ce40085feb0d0..895b68b5a9145 100644
--- a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll
+++ b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll
@@ -25,7 +25,8 @@ define i32 @test(i32, i32) local_unnamed_addr #0 {
%11 = sub nsw i32 %7, %9
%12 = icmp slt i32 %10, %11
br i1 %12, label %5, label %13
-; CHECK: if r2 s> r1 goto -10 <test+0x40>
+; CHECK: r1 = r3
+; CHECK: if r2 s> r3 goto -10 <test+0x40>
; <label>:13: ; preds = %5, %2
%14 = phi i32 [ 0, %2 ], [ %9, %5 ]
diff --git a/llvm/test/CodeGen/Hexagon/swp-stages5.ll b/llvm/test/CodeGen/Hexagon/swp-stages5.ll
index d6c478299fb36..f3bc8891b4202 100644
--- a/llvm/test/CodeGen/Hexagon/swp-stages5.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-stages5.ll
@@ -8,7 +8,6 @@
; CHECK-DAG: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: = and([[REG0]],#255)
-; CHECK: [[REG0]]{{[:0-9]*}} =
; CHECK: endloop
define void @fred(ptr noalias nocapture %src, i32 %srcWidth, i32 %srcHeight, i32 %srcStride, ptr noalias nocapture %dst, i32 %dstStride) #0 {
diff --git a/llvm/test/CodeGen/NVPTX/atomics-b128.ll b/llvm/test/CodeGen/NVPTX/atomics-b128.ll
index 3057e91e8ebe4..b2a3f94d11a16 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-b128.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-b128.ll
@@ -756,24 +756,24 @@ define i128 @test_atomicrmw_and(ptr %ptr, i128 %val) {
; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
; CHECK-NEXT: $L__BB34_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b64 %rd2, %rd12;
-; CHECK-NEXT: mov.b64 %rd1, %rd11;
-; CHECK-NEXT: and.b64 %rd6, %rd1, %rd4;
-; CHECK-NEXT: and.b64 %rd7, %rd2, %rd5;
+; CHECK-NEXT: and.b64 %rd6, %rd11, %rd4;
+; CHECK-NEXT: and.b64 %rd7, %rd12, %rd5;
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b128 cmp, swap, dst;
-; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2};
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7};
; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
-; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
; CHECK-NEXT: }
-; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2;
-; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1;
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
; CHECK-NEXT: @%p1 bra $L__BB34_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
%ret = atomicrmw and ptr %ptr, i128 %val monotonic
ret i128 %ret
@@ -791,24 +791,24 @@ define i128 @test_atomicrmw_or(ptr %ptr, i128 %val) {
; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
; CHECK-NEXT: $L__BB35_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b64 %rd2, %rd12;
-; CHECK-NEXT: mov.b64 %rd1, %rd11;
-; CHECK-NEXT: or.b64 %rd6, %rd1, %rd4;
-; CHECK-NEXT: or.b64 %rd7, %rd2, %rd5;
+; CHECK-NEXT: or.b64 %rd6, %rd11, %rd4;
+; CHECK-NEXT: or.b64 %rd7, %rd12, %rd5;
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b128 cmp, swap, dst;
-; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2};
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7};
; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
-; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
; CHECK-NEXT: }
-; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2;
-; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1;
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
; CHECK-NEXT: @%p1 bra $L__BB35_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
%ret = atomicrmw or ptr %ptr, i128 %val monotonic
ret i128 %ret
@@ -826,24 +826,24 @@ define i128 @test_atomicrmw_xor(ptr %ptr, i128 %val) {
; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
; CHECK-NEXT: $L__BB36_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b64 %rd2, %rd12;
-; CHECK-NEXT: mov.b64 %rd1, %rd11;
-; CHECK-NEXT: xor.b64 %rd6, %rd1, %rd4;
-; CHECK-NEXT: xor.b64 %rd7, %rd2, %rd5;
+; CHECK-NEXT: xor.b64 %rd6, %rd11, %rd4;
+; CHECK-NEXT: xor.b64 %rd7, %rd12, %rd5;
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b128 cmp, swap, dst;
-; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2};
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
; CHECK-NEXT: mov.b128 swap, {%rd6, %rd7};
; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
-; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
; CHECK-NEXT: }
-; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2;
-; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1;
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
; CHECK-NEXT: setp.ne.b64 %p1, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
; CHECK-NEXT: @%p1 bra $L__BB36_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
%ret = atomicrmw xor ptr %ptr, i128 %val monotonic
ret i128 %ret
@@ -861,29 +861,29 @@ define i128 @test_atomicrmw_min(ptr %ptr, i128 %val) {
; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
; CHECK-NEXT: $L__BB37_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b64 %rd2, %rd12;
-; CHECK-NEXT: mov.b64 %rd1, %rd11;
-; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd4;
-; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5;
+; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5;
; CHECK-NEXT: and.pred %p3, %p2, %p1;
-; CHECK-NEXT: setp.lt.s64 %p4, %rd2, %rd5;
+; CHECK-NEXT: setp.lt.s64 %p4, %rd12, %rd5;
; CHECK-NEXT: or.pred %p5, %p3, %p4;
-; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5;
-; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5;
+; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5;
+; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5;
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b128 cmp, swap, dst;
-; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2};
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6};
; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
-; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
; CHECK-NEXT: }
-; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2;
-; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1;
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
; CHECK-NEXT: @%p6 bra $L__BB37_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
%ret = atomicrmw min ptr %ptr, i128 %val monotonic
ret i128 %ret
@@ -901,29 +901,29 @@ define i128 @test_atomicrmw_max(ptr %ptr, i128 %val) {
; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
; CHECK-NEXT: $L__BB38_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b64 %rd2, %rd12;
-; CHECK-NEXT: mov.b64 %rd1, %rd11;
-; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd4;
-; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5;
+; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5;
; CHECK-NEXT: and.pred %p3, %p2, %p1;
-; CHECK-NEXT: setp.gt.s64 %p4, %rd2, %rd5;
+; CHECK-NEXT: setp.gt.s64 %p4, %rd12, %rd5;
; CHECK-NEXT: or.pred %p5, %p3, %p4;
-; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5;
-; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5;
+; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5;
+; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5;
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b128 cmp, swap, dst;
-; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2};
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6};
; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
-; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
; CHECK-NEXT: }
-; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2;
-; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1;
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
; CHECK-NEXT: @%p6 bra $L__BB38_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
%ret = atomicrmw max ptr %ptr, i128 %val monotonic
ret i128 %ret
@@ -941,29 +941,29 @@ define i128 @test_atomicrmw_umin(ptr %ptr, i128 %val) {
; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
; CHECK-NEXT: $L__BB39_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b64 %rd2, %rd12;
-; CHECK-NEXT: mov.b64 %rd1, %rd11;
-; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd4;
-; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5;
+; CHECK-NEXT: setp.lt.u64 %p1, %rd11, %rd4;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5;
; CHECK-NEXT: and.pred %p3, %p2, %p1;
-; CHECK-NEXT: setp.lt.u64 %p4, %rd2, %rd5;
+; CHECK-NEXT: setp.lt.u64 %p4, %rd12, %rd5;
; CHECK-NEXT: or.pred %p5, %p3, %p4;
-; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5;
-; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5;
+; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5;
+; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5;
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b128 cmp, swap, dst;
-; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2};
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6};
; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
-; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
; CHECK-NEXT: }
-; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2;
-; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1;
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
; CHECK-NEXT: @%p6 bra $L__BB39_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
%ret = atomicrmw umin ptr %ptr, i128 %val monotonic
ret i128 %ret
@@ -981,29 +981,29 @@ define i128 @test_atomicrmw_umax(ptr %ptr, i128 %val) {
; CHECK-NEXT: ld.v2.b64 {%rd11, %rd12}, [%rd3];
; CHECK-NEXT: $L__BB40_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b64 %rd2, %rd12;
-; CHECK-NEXT: mov.b64 %rd1, %rd11;
-; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd4;
-; CHECK-NEXT: setp.eq.b64 %p2, %rd2, %rd5;
+; CHECK-NEXT: setp.gt.u64 %p1, %rd11, %rd4;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd12, %rd5;
; CHECK-NEXT: and.pred %p3, %p2, %p1;
-; CHECK-NEXT: setp.gt.u64 %p4, %rd2, %rd5;
+; CHECK-NEXT: setp.gt.u64 %p4, %rd12, %rd5;
; CHECK-NEXT: or.pred %p5, %p3, %p4;
-; CHECK-NEXT: selp.b64 %rd6, %rd2, %rd5, %p5;
-; CHECK-NEXT: selp.b64 %rd7, %rd1, %rd4, %p5;
+; CHECK-NEXT: selp.b64 %rd6, %rd12, %rd5, %p5;
+; CHECK-NEXT: selp.b64 %rd7, %rd11, %rd4, %p5;
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b128 cmp, swap, dst;
-; CHECK-NEXT: mov.b128 cmp, {%rd1, %rd2};
+; CHECK-NEXT: mov.b128 cmp, {%rd11, %rd12};
; CHECK-NEXT: mov.b128 swap, {%rd7, %rd6};
; CHECK-NEXT: atom.relaxed.sys.cas.b128 dst, [%rd3], cmp, swap;
-; CHECK-NEXT: mov.b128 {%rd11, %rd12}, dst;
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, dst;
; CHECK-NEXT: }
-; CHECK-NEXT: xor.b64 %rd8, %rd12, %rd2;
-; CHECK-NEXT: xor.b64 %rd9, %rd11, %rd1;
+; CHECK-NEXT: xor.b64 %rd8, %rd2, %rd12;
+; CHECK-NEXT: xor.b64 %rd9, %rd1, %rd11;
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd8;
; CHECK-NEXT: setp.ne.b64 %p6, %rd10, 0;
+; CHECK-NEXT: mov.b64 %rd11, %rd1;
+; CHECK-NEXT: mov.b64 %rd12, %rd2;
; CHECK-NEXT: @%p6 bra $L__BB40_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd11, %rd12};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
%ret = atomicrmw umax ptr %ptr, i128 %val monotonic
ret i128 %ret
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
index 313be95c03192..e2762bac45a35 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -63,32 +63,32 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: ld.b32 %r46, [%r1];
; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT: mov.b32 %r4, %r46;
-; CHECKPTX62-NEXT: shr.u32 %r20, %r4, %r2;
+; CHECKPTX62-NEXT: shr.u32 %r20, %r46, %r2;
; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r20;
; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1;
; CHECKPTX62-NEXT: cvt.u32.u16 %r21, %rs3;
; CHECKPTX62-NEXT: shl.b32 %r22, %r21, %r2;
-; CHECKPTX62-NEXT: and.b32 %r23, %r4, %r3;
+; CHECKPTX62-NEXT: and.b32 %r23, %r46, %r3;
; CHECKPTX62-NEXT: or.b32 %r24, %r23, %r22;
-; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r46, [%r1], %r4, %r24;
-; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r46, %r4;
+; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
+; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r4, %r46;
+; CHECKPTX62-NEXT: mov.b32 %r46, %r4;
; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44
; CHECKPTX62-NEXT: ld.b32 %r47, [%r1];
; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT: mov.b32 %r5, %r47;
-; CHECKPTX62-NEXT: shr.u32 %r25, %r5, %r2;
+; CHECKPTX62-NEXT: shr.u32 %r25, %r47, %r2;
; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r25;
; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00;
; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5;
; CHECKPTX62-NEXT: cvt.u32.u16 %r26, %rs6;
; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2;
-; CHECKPTX62-NEXT: and.b32 %r28, %r5, %r3;
+; CHECKPTX62-NEXT: and.b32 %r28, %r47, %r3;
; CHECKPTX62-NEXT: or.b32 %r29, %r28, %r27;
-; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r47, [%r1], %r5, %r29;
-; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r47, %r5;
+; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
+; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r5, %r47;
+; CHECKPTX62-NEXT: mov.b32 %r47, %r5;
; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26
; CHECKPTX62-NEXT: and.b32 %r6, %r14, -4;
@@ -100,16 +100,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: ld.global.b32 %r48, [%r6];
; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT: mov.b32 %r9, %r48;
-; CHECKPTX62-NEXT: shr.u32 %r33, %r9, %r7;
+; CHECKPTX62-NEXT: shr.u32 %r33, %r48, %r7;
; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r33;
; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1;
; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs8;
; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r7;
-; CHECKPTX62-NEXT: and.b32 %r36, %r9, %r8;
+; CHECKPTX62-NEXT: and.b32 %r36, %r48, %r8;
; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35;
-; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r48, [%r6], %r9, %r37;
-; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r48, %r9;
+; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
+; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r9, %r48;
+; CHECKPTX62-NEXT: mov.b32 %r48, %r9;
; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8
; CHECKPTX62-NEXT: and.b32 %r10, %r15, -4;
@@ -121,16 +121,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: ld.shared.b32 %r49, [%r10];
; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT: mov.b32 %r13, %r49;
-; CHECKPTX62-NEXT: shr.u32 %r41, %r13, %r11;
+; CHECKPTX62-NEXT: shr.u32 %r41, %r49, %r11;
; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r41;
; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1;
; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs10;
; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11;
-; CHECKPTX62-NEXT: and.b32 %r44, %r13, %r12;
+; CHECKPTX62-NEXT: and.b32 %r44, %r49, %r12;
; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43;
-; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r49, [%r10], %r13, %r45;
-; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r49, %r13;
+; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
+; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r13, %r49;
+; CHECKPTX62-NEXT: mov.b32 %r49, %r13;
; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX62-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index f5eefaa57fc09..e6c6a73eef14d 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -63,33 +63,33 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: ld.b32 %r46, [%r1];
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: mov.b32 %r4, %r46;
-; CHECKPTX71-NEXT: shr.u32 %r20, %r4, %r2;
+; CHECKPTX71-NEXT: shr.u32 %r20, %r46, %r2;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r20;
; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80;
; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
; CHECKPTX71-NEXT: cvt.u32.u16 %r21, %rs4;
; CHECKPTX71-NEXT: shl.b32 %r22, %r21, %r2;
-; CHECKPTX71-NEXT: and.b32 %r23, %r4, %r3;
+; CHECKPTX71-NEXT: and.b32 %r23, %r46, %r3;
; CHECKPTX71-NEXT: or.b32 %r24, %r23, %r22;
-; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r46, [%r1], %r4, %r24;
-; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r46, %r4;
+; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
+; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r4, %r46;
+; CHECKPTX71-NEXT: mov.b32 %r46, %r4;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
; CHECKPTX71-NEXT: ld.b32 %r47, [%r1];
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: mov.b32 %r5, %r47;
-; CHECKPTX71-NEXT: shr.u32 %r25, %r5, %r2;
+; CHECKPTX71-NEXT: shr.u32 %r25, %r47, %r2;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r25;
; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80;
; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
; CHECKPTX71-NEXT: cvt.u32.u16 %r26, %rs7;
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
-; CHECKPTX71-NEXT: and.b32 %r28, %r5, %r3;
+; CHECKPTX71-NEXT: and.b32 %r28, %r47, %r3;
; CHECKPTX71-NEXT: or.b32 %r29, %r28, %r27;
-; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r47, [%r1], %r5, %r29;
-; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r47, %r5;
+; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
+; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r5, %r47;
+; CHECKPTX71-NEXT: mov.b32 %r47, %r5;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
; CHECKPTX71-NEXT: and.b32 %r6, %r14, -4;
@@ -101,17 +101,17 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: ld.global.b32 %r48, [%r6];
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: mov.b32 %r9, %r48;
-; CHECKPTX71-NEXT: shr.u32 %r33, %r9, %r7;
+; CHECKPTX71-NEXT: shr.u32 %r33, %r48, %r7;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r33;
; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80;
; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs10;
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r7;
-; CHECKPTX71-NEXT: and.b32 %r36, %r9, %r8;
+; CHECKPTX71-NEXT: and.b32 %r36, %r48, %r8;
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r48, [%r6], %r9, %r37;
-; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r48, %r9;
+; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
+; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r9, %r48;
+; CHECKPTX71-NEXT: mov.b32 %r48, %r9;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
; CHECKPTX71-NEXT: and.b32 %r10, %r15, -4;
@@ -123,17 +123,17 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: ld.shared.b32 %r49, [%r10];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: mov.b32 %r13, %r49;
-; CHECKPTX71-NEXT: shr.u32 %r41, %r13, %r11;
+; CHECKPTX71-NEXT: shr.u32 %r41, %r49, %r11;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r41;
; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80;
; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs13;
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
-; CHECKPTX71-NEXT: and.b32 %r44, %r13, %r12;
+; CHECKPTX71-NEXT: and.b32 %r44, %r49, %r12;
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r49, [%r10], %r13, %r45;
-; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r49, %r13;
+; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
+; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r13, %r49;
+; CHECKPTX71-NEXT: mov.b32 %r49, %r13;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX71-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index a4b49f7136d1d..6ea02f35e9626 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -442,22 +442,22 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
; CHECK-NEXT: cvt.f32.f16 %r10, %rs1;
; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov.b32 %r3, %r17;
-; CHECK-NEXT: shr.u32 %r8, %r3, %r1;
+; CHECK-NEXT: shr.u32 %r8, %r17, %r1;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r8;
; CHECK-NEXT: cvt.f32.f16 %r9, %rs2;
; CHECK-NEXT: add.rn.f32 %r11, %r9, %r10;
; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r11;
; CHECK-NEXT: cvt.u32.u16 %r12, %rs3;
; CHECK-NEXT: shl.b32 %r13, %r12, %r1;
-; CHECK-NEXT: and.b32 %r14, %r3, %r2;
+; CHECK-NEXT: and.b32 %r14, %r17, %r2;
; CHECK-NEXT: or.b32 %r15, %r14, %r13;
; CHECK-NEXT: membar.sys;
-; CHECK-NEXT: atom.cas.b32 %r17, [%rd1], %r3, %r15;
-; CHECK-NEXT: setp.ne.b32 %p1, %r17, %r3;
+; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r17, %r15;
+; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r17;
+; CHECK-NEXT: mov.b32 %r17, %r3;
; CHECK-NEXT: @%p1 bra $L__BB24_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: shr.u32 %r16, %r17, %r1;
+; CHECK-NEXT: shr.u32 %r16, %r3, %r1;
; CHECK-NEXT: st.param.b16 [func_retval0], %r16;
; CHECK-NEXT: ret;
%ret = atomicrmw fadd ptr %addr, half %val seq_cst
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll b/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
index fdb01314a7d4c..d6dd959365401 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-fp128.ll
@@ -49,15 +49,15 @@ define void @fmul_ctrloop_fp128() nounwind {
; PWR8-NEXT: #
; PWR8-NEXT: lxvd2x 0, 30, 28
; PWR8-NEXT: vmr 2, 31
-; PWR8-NEXT: mr 26, 30
-; PWR8-NEXT: addi 30, 30, 16
+; PWR8-NEXT: addi 26, 30, 16
; PWR8-NEXT: xxswapd 35, 0
; PWR8-NEXT: bl __mulkf3
; PWR8-NEXT: nop
; PWR8-NEXT: addi 29, 29, -1
; PWR8-NEXT: xxswapd 0, 34
; PWR8-NEXT: cmpldi 29, 0
-; PWR8-NEXT: stxvd2x 0, 26, 27
+; PWR8-NEXT: stxvd2x 0, 30, 27
+; PWR8-NEXT: mr 30, 26
; PWR8-NEXT: bc 12, 1, .LBB0_1
; PWR8-NEXT: # %bb.2: # %for.end
; PWR8-NEXT: li 3, 48
diff --git a/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll b/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll
index 786988fae08c8..826e3060464a3 100644
--- a/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll
+++ b/llvm/test/CodeGen/PowerPC/licm-xxsplti.ll
@@ -23,11 +23,11 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu
; AIX64-NEXT: # %bb.2: # %for.body.preheader.new
; AIX64-NEXT: rlwinm 6, 5, 0, 1, 30
; AIX64-NEXT: xxspltib 0, 6
-; AIX64-NEXT: addi 11, 4, -8
+; AIX64-NEXT: addi 9, 4, -8
; AIX64-NEXT: addi 7, 3, -8
; AIX64-NEXT: li 8, 8
-; AIX64-NEXT: li 9, 12
-; AIX64-NEXT: li 10, 4
+; AIX64-NEXT: li 10, 12
+; AIX64-NEXT: li 11, 4
; AIX64-NEXT: addi 6, 6, -2
; AIX64-NEXT: rldicl 6, 6, 63, 1
; AIX64-NEXT: addi 6, 6, 1
@@ -36,16 +36,16 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu
; AIX64-NEXT: .align 4
; AIX64-NEXT: L..BB0_3: # %for.body
; AIX64-NEXT: #
-; AIX64-NEXT: lxvwsx 1, 11, 8
+; AIX64-NEXT: lxvwsx 1, 9, 8
; AIX64-NEXT: addi 6, 6, 2
; AIX64-NEXT: xxland 1, 1, 0
; AIX64-NEXT: xscvspdpn 1, 1
; AIX64-NEXT: stfsu 1, 8(7)
-; AIX64-NEXT: lxvwsx 1, 11, 9
-; AIX64-NEXT: addi 11, 11, 8
+; AIX64-NEXT: lxvwsx 1, 9, 10
+; AIX64-NEXT: addi 9, 9, 8
; AIX64-NEXT: xxland 1, 1, 0
; AIX64-NEXT: xxsldwi 1, 1, 1, 3
-; AIX64-NEXT: stfiwx 1, 7, 10
+; AIX64-NEXT: stfiwx 1, 7, 11
; AIX64-NEXT: bdnz L..BB0_3
; AIX64-NEXT: L..BB0_4: # %for.cond.cleanup.loopexit.unr-lcssa
; AIX64-NEXT: andi. 5, 5, 1
@@ -79,16 +79,16 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu
; AIX32-NEXT: L..BB0_3: # %for.body
; AIX32-NEXT: #
; AIX32-NEXT: lxvwsx 1, 12, 9
-; AIX32-NEXT: lxvwsx 2, 12, 10
; AIX32-NEXT: addic 6, 6, 2
-; AIX32-NEXT: addi 12, 12, 8
; AIX32-NEXT: addze 11, 11
; AIX32-NEXT: xor 0, 6, 7
; AIX32-NEXT: or. 0, 0, 11
; AIX32-NEXT: xxland 1, 1, 0
; AIX32-NEXT: xscvspdpn 1, 1
; AIX32-NEXT: stfsu 1, 8(8)
-; AIX32-NEXT: xxland 1, 2, 0
+; AIX32-NEXT: lxvwsx 1, 12, 10
+; AIX32-NEXT: addi 12, 12, 8
+; AIX32-NEXT: xxland 1, 1, 0
; AIX32-NEXT: xscvspdpn 1, 1
; AIX32-NEXT: stfs 1, 4(8)
; AIX32-NEXT: bne 0, L..BB0_3
@@ -116,11 +116,11 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu
; LINUX64LE-NEXT: # %bb.2: # %for.body.preheader.new
; LINUX64LE-NEXT: rlwinm 6, 5, 0, 1, 30
; LINUX64LE-NEXT: xxspltib 0, 6
-; LINUX64LE-NEXT: addi 11, 4, -8
+; LINUX64LE-NEXT: addi 8, 4, -8
; LINUX64LE-NEXT: addi 7, 3, -8
-; LINUX64LE-NEXT: li 8, 8
-; LINUX64LE-NEXT: li 9, 12
-; LINUX64LE-NEXT: li 10, 4
+; LINUX64LE-NEXT: li 9, 8
+; LINUX64LE-NEXT: li 10, 12
+; LINUX64LE-NEXT: li 11, 4
; LINUX64LE-NEXT: addi 6, 6, -2
; LINUX64LE-NEXT: rldicl 6, 6, 63, 1
; LINUX64LE-NEXT: addi 6, 6, 1
@@ -129,16 +129,16 @@ define void @_Z3fooPfS_Pi(ptr noalias nocapture noundef %_a, ptr noalias nocaptu
; LINUX64LE-NEXT: .p2align 4
; LINUX64LE-NEXT: .LBB0_3: # %for.body
; LINUX64LE-NEXT: #
-; LINUX64LE-NEXT: lxvwsx 1, 11, 8
+; LINUX64LE-NEXT: lxvwsx 1, 8, 9
; LINUX64LE-NEXT: addi 6, 6, 2
; LINUX64LE-NEXT: xxland 1, 1, 0
; LINUX64LE-NEXT: xxsldwi 1, 1, 1, 3
; LINUX64LE-NEXT: xscvspdpn 1, 1
; LINUX64LE-NEXT: stfsu 1, 8(7)
-; LINUX64LE-NEXT: lxvwsx 1, 11, 9
-; LINUX64LE-NEXT: addi 11, 11, 8
+; LINUX64LE-NEXT: lxvwsx 1, 8, 10
+; LINUX64LE-NEXT: addi 8, 8, 8
; LINUX64LE-NEXT: xxland 1, 1, 0
-; LINUX64LE-NEXT: stxvrwx 1, 7, 10
+; LINUX64LE-NEXT: stxvrwx 1, 7, 11
; LINUX64LE-NEXT: bdnz .LBB0_3
; LINUX64LE-NEXT: .LBB0_4: # %for.cond.cleanup.loopexit.unr-lcssa
; LINUX64LE-NEXT: andi. 5, 5, 1
diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
index 4e0394ee4fb8c..cc38e250f183f 100644
--- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -189,8 +189,8 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: cmplwi r4, 0
; CHECK-NEXT: beq cr0, .LBB2_4
; CHECK-NEXT: # %bb.1: # %bb3.preheader
+; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
-; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
; CHECK-NEXT: addi r10, r3, 4002
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: li r5, -1
@@ -198,6 +198,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: li r7, 3
; CHECK-NEXT: li r8, 5
; CHECK-NEXT: li r9, 9
+; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
@@ -214,7 +215,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: ldx r28, r10, r8
; CHECK-NEXT: ld r27, 12(r10)
; CHECK-NEXT: ld r26, 8(r10)
-; CHECK-NEXT: ldx r12, r10, r9
+; CHECK-NEXT: ldx r25, r10, r9
; CHECK-NEXT: addi r10, r10, 1
; CHECK-NEXT: mulld r11, r11, r0
; CHECK-NEXT: mulld r11, r11, r30
@@ -222,7 +223,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: mulld r11, r11, r28
; CHECK-NEXT: mulld r11, r11, r27
; CHECK-NEXT: mulld r11, r11, r26
-; CHECK-NEXT: maddld r3, r11, r12, r3
+; CHECK-NEXT: maddld r3, r11, r25, r3
; CHECK-NEXT: bdnz .LBB2_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
@@ -231,6 +232,7 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r3, r3, r4
; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
+; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: addi r3, r4, 0
diff --git a/llvm/test/CodeGen/PowerPC/sink-side-effect.ll b/llvm/test/CodeGen/PowerPC/sink-side-effect.ll
index 040c20b0e4cf7..94d2a090bce10 100644
--- a/llvm/test/CodeGen/PowerPC/sink-side-effect.ll
+++ b/llvm/test/CodeGen/PowerPC/sink-side-effect.ll
@@ -23,7 +23,7 @@ define double @zot(ptr %arg, ptr %arg1, ptr %arg2) {
; CHECK-NEXT: cmpw 4, 3
; CHECK-NEXT: bge 0, .LBB0_3
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: xsmuldp 1, 1, 0
+; CHECK-NEXT: xsmuldp 0, 0, 1
; CHECK-NEXT: b .LBB0_3
bb:
%tmp = load i32, ptr %arg, align 8
diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll
index 509457042ed68..516d54ba2fdbe 100644
--- a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll
@@ -26,12 +26,11 @@ define void @main() nounwind #0 {
; CHECK-NEXT: mullw 4, 6, 6
; CHECK-NEXT: addi 5, 6, 1
; CHECK-NEXT: bdz .LBB0_3
-; CHECK-NEXT: .p2align 5
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2:
-; CHECK-NEXT: mr 6, 5
; CHECK-NEXT: stwu 4, 4(3)
+; CHECK-NEXT: mullw 4, 5, 5
; CHECK-NEXT: addi 5, 5, 1
-; CHECK-NEXT: mullw 4, 6, 6
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: stwu 4, 4(3)
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
index 9cb2d4444b974..871aab324a99b 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
@@ -145,14 +145,14 @@ declare <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float>, <4 x float>)
; CHECK32-NEXT: .align 4
; CHECK32-NEXT: [[L2_foo:.*]]: # %for.body
; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK32-NEXT: slwi r8, r7, 4
-; CHECK32-NEXT: addic r7, r7, 1
-; CHECK32-NEXT: addze r6, r6
+; CHECK32-NEXT: slwi r8, r6, 4
+; CHECK32-NEXT: addic r6, r6, 1
+; CHECK32-NEXT: addze r7, r7
; CHECK32-NEXT: lxvx vs2, r4, r8
; CHECK32-NEXT: xvmaddmsp vs2, vs0, vs1
; CHECK32-NEXT: stxvx vs2, r3, r8
-; CHECK32-NEXT: xor r8, r7, r5
-; CHECK32-NEXT: or. r8, r8, r6
+; CHECK32-NEXT: xor r8, r6, r5
+; CHECK32-NEXT: or. r8, r8, r7
; CHECK32-NEXT: bne cr0, [[L2_foo]]
; CHECK32: .foo:
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
index 2aec92eca145f..02aeebdeb3775 100644
--- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll
+++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -127,11 +127,13 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
; RV32-NEXT: .LBB3_2: # %while.body
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-NEXT: lw a3, 0(a1)
-; RV32-NEXT: addi a1, a1, 4
+; RV32-NEXT: addi a4, a1, 4
; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: addi a1, a0, 4
; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: addi a0, a0, 4
-; RV32-NEXT: bne a1, a2, .LBB3_2
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: mv a1, a4
+; RV32-NEXT: bne a4, a2, .LBB3_2
; RV32-NEXT: .LBB3_3: # %while.end
; RV32-NEXT: li a0, 0
; RV32-NEXT: ret
@@ -149,11 +151,13 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
; RV64-NEXT: .LBB3_2: # %while.body
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-NEXT: lw a3, 0(a1)
-; RV64-NEXT: addi a1, a1, 4
+; RV64-NEXT: addi a4, a1, 4
; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: addi a1, a0, 4
; RV64-NEXT: sw a3, 0(a0)
-; RV64-NEXT: addi a0, a0, 4
-; RV64-NEXT: bne a1, a2, .LBB3_2
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: mv a1, a4
+; RV64-NEXT: bne a4, a2, .LBB3_2
; RV64-NEXT: .LBB3_3: # %while.end
; RV64-NEXT: li a0, 0
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index 2b800c449953b..3250821a92534 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -44,8 +44,9 @@ define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) {
; CHECK-LABEL: m2_splat_with_tail:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v10, v8
-; CHECK-NEXT: vrgather.vi v8, v10, 0
+; CHECK-NEXT: vrgather.vi v10, v8, 0
+; CHECK-NEXT: vmv1r.v v11, v9
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
ret <4 x i64> %res
@@ -98,8 +99,9 @@ define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) {
; CHECK-LABEL: m2_splat_into_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v10, v8
-; CHECK-NEXT: vrgather.vi v8, v10, 0
+; CHECK-NEXT: vrgather.vi v10, v8, 0
+; CHECK-NEXT: vmv1r.v v11, v9
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
ret <4 x i64> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index a4c793b49d54a..ab9849631663c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
@@ -36,7 +36,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
; CHECK-NEXT: .cfi_offset s10, -96
; CHECK-NEXT: .cfi_offset s11, -104
; CHECK-NEXT: li a6, 0
-; CHECK-NEXT: li a7, 8
+; CHECK-NEXT: li s2, 8
; CHECK-NEXT: li t0, 12
; CHECK-NEXT: li s0, 4
; CHECK-NEXT: li t1, 20
@@ -45,7 +45,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: andi t3, a4, 1
-; CHECK-NEXT: li s2, 4
+; CHECK-NEXT: li t2, 4
; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader.i
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB0_2 Depth 2
@@ -53,9 +53,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
; CHECK-NEXT: # Child Loop BB0_4 Depth 4
; CHECK-NEXT: # Child Loop BB0_5 Depth 5
; CHECK-NEXT: mv t4, t1
-; CHECK-NEXT: mv t2, s2
+; CHECK-NEXT: mv t5, t2
; CHECK-NEXT: mv t6, t0
-; CHECK-NEXT: mv s3, a7
+; CHECK-NEXT: mv a7, s2
; CHECK-NEXT: mv s4, a6
; CHECK-NEXT: .LBB0_2: # %for.cond5.preheader.i
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
@@ -64,9 +64,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
; CHECK-NEXT: # Child Loop BB0_4 Depth 4
; CHECK-NEXT: # Child Loop BB0_5 Depth 5
; CHECK-NEXT: mv s5, t4
-; CHECK-NEXT: mv t5, t2
+; CHECK-NEXT: mv s6, t5
; CHECK-NEXT: mv s7, t6
-; CHECK-NEXT: mv s8, s3
+; CHECK-NEXT: mv s3, a7
; CHECK-NEXT: mv s9, s4
; CHECK-NEXT: .LBB0_3: # %for.cond9.preheader.i
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
@@ -75,9 +75,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
; CHECK-NEXT: # Child Loop BB0_4 Depth 4
; CHECK-NEXT: # Child Loop BB0_5 Depth 5
; CHECK-NEXT: mv s11, s5
-; CHECK-NEXT: mv s6, t5
+; CHECK-NEXT: mv a3, s6
; CHECK-NEXT: mv ra, s7
-; CHECK-NEXT: mv a5, s8
+; CHECK-NEXT: mv s8, s3
; CHECK-NEXT: mv s1, s9
; CHECK-NEXT: .LBB0_4: # %vector.ph.i
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
@@ -92,44 +92,45 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
; CHECK-NEXT: # Parent Loop BB0_3 Depth=3
; CHECK-NEXT: # Parent Loop BB0_4 Depth=4
; CHECK-NEXT: # => This Inner Loop Header: Depth=5
-; CHECK-NEXT: add a4, a5, a1
-; CHECK-NEXT: add a3, s6, a1
-; CHECK-NEXT: addi a1, a1, 4
+; CHECK-NEXT: addi a5, a1, 4
+; CHECK-NEXT: add a4, s8, a1
+; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: vse32.v v8, (a4), v0.t
-; CHECK-NEXT: vse32.v v8, (a3), v0.t
-; CHECK-NEXT: bne a1, s0, .LBB0_5
+; CHECK-NEXT: vse32.v v8, (a1), v0.t
+; CHECK-NEXT: mv a1, a5
+; CHECK-NEXT: bne a5, s0, .LBB0_5
; CHECK-NEXT: # %bb.6: # %for.cond.cleanup15.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=4
; CHECK-NEXT: addi s1, s1, 4
-; CHECK-NEXT: addi a5, a5, 4
+; CHECK-NEXT: addi s8, s8, 4
; CHECK-NEXT: addi ra, ra, 4
-; CHECK-NEXT: addi s6, s6, 4
+; CHECK-NEXT: addi a3, a3, 4
; CHECK-NEXT: andi s10, a0, 1
; CHECK-NEXT: addi s11, s11, 4
; CHECK-NEXT: beqz s10, .LBB0_4
; CHECK-NEXT: # %bb.7: # %for.cond.cleanup11.i
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=3
; CHECK-NEXT: addi s9, s9, 4
-; CHECK-NEXT: addi s8, s8, 4
+; CHECK-NEXT: addi s3, s3, 4
; CHECK-NEXT: addi s7, s7, 4
-; CHECK-NEXT: addi t5, t5, 4
+; CHECK-NEXT: addi s6, s6, 4
; CHECK-NEXT: andi a1, a2, 1
; CHECK-NEXT: addi s5, s5, 4
; CHECK-NEXT: beqz a1, .LBB0_3
; CHECK-NEXT: # %bb.8: # %for.cond.cleanup7.i
; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=2
; CHECK-NEXT: addi s4, s4, 4
-; CHECK-NEXT: addi s3, s3, 4
+; CHECK-NEXT: addi a7, a7, 4
; CHECK-NEXT: addi t6, t6, 4
-; CHECK-NEXT: addi t2, t2, 4
+; CHECK-NEXT: addi t5, t5, 4
; CHECK-NEXT: addi t4, t4, 4
; CHECK-NEXT: beqz t3, .LBB0_2
; CHECK-NEXT: # %bb.9: # %for.cond.cleanup3.i
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: addi a6, a6, 4
-; CHECK-NEXT: addi a7, a7, 4
-; CHECK-NEXT: addi t0, t0, 4
; CHECK-NEXT: addi s2, s2, 4
+; CHECK-NEXT: addi t0, t0, 4
+; CHECK-NEXT: addi t2, t2, 4
; CHECK-NEXT: addi t1, t1, 4
; CHECK-NEXT: beqz a1, .LBB0_1
; CHECK-NEXT: # %bb.10: # %l.exit
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll
index 95bff27fe8ca6..57f1977c27b82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/remat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -314,10 +314,13 @@ define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, p
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 5
-; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
@@ -326,7 +329,7 @@ define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, p
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmv.v.i v24, 0
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: srli a1, a2, 3
; CHECK-NEXT: slli a2, a2, 3
@@ -334,15 +337,6 @@ define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, p
; CHECK-NEXT: vmv.v.i v0, 0
; CHECK-NEXT: .LBB8_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: mv a5, a4
-; CHECK-NEXT: slli a4, a4, 3
-; CHECK-NEXT: add a5, a5, a4
-; CHECK-NEXT: slli a4, a4, 1
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: add a4, sp, a4
-; CHECK-NEXT: addi a4, a4, 16
-; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vmv.v.x v8, a0
; CHECK-NEXT: csrr a4, vlenb
; CHECK-NEXT: slli a5, a4, 4
@@ -350,33 +344,23 @@ define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, p
; CHECK-NEXT: add a4, sp, a4
; CHECK-NEXT: addi a4, a4, 16
; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: mv a5, a4
-; CHECK-NEXT: slli a4, a4, 3
-; CHECK-NEXT: add a5, a5, a4
-; CHECK-NEXT: slli a4, a4, 1
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: add a4, sp, a4
-; CHECK-NEXT: addi a4, a4, 16
-; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vand.vv v16, v16, v8
+; CHECK-NEXT: vand.vv v16, v0, v8
+; CHECK-NEXT: vmv8r.v v8, v24
; CHECK-NEXT: vmsne.vi v24, v16, 0
; CHECK-NEXT: csrr a4, vlenb
; CHECK-NEXT: slli a4, a4, 4
; CHECK-NEXT: add a4, sp, a4
; CHECK-NEXT: addi a4, a4, 16
; CHECK-NEXT: vs1r.v v24, (a4) # vscale x 8-byte Folded Spill
-; CHECK-NEXT: vand.vv v16, v0, v8
-; CHECK-NEXT: vmsne.vi v8, v16, 0
+; CHECK-NEXT: vmv8r.v v24, v8
; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: mv a5, a4
-; CHECK-NEXT: slli a4, a4, 3
-; CHECK-NEXT: add a5, a5, a4
-; CHECK-NEXT: slli a4, a4, 1
-; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: slli a5, a4, 4
+; CHECK-NEXT: add a4, a5, a4
; CHECK-NEXT: add a4, sp, a4
; CHECK-NEXT: addi a4, a4, 16
-; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vand.vv v16, v24, v8
+; CHECK-NEXT: vmsne.vi v8, v16, 0
; CHECK-NEXT: csrr a4, vlenb
; CHECK-NEXT: slli a4, a4, 4
; CHECK-NEXT: add a4, sp, a4
@@ -397,19 +381,22 @@ define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, p
; CHECK-NEXT: addi a5, sp, 16
; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli a5, zero, e64, m8, ta, ma
-; CHECK-NEXT: vor.vv v16, v16, v8
+; CHECK-NEXT: vor.vv v0, v0, v8
; CHECK-NEXT: csrr a5, vlenb
; CHECK-NEXT: slli a5, a5, 3
; CHECK-NEXT: add a5, sp, a5
; CHECK-NEXT: addi a5, a5, 16
; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vor.vv v0, v0, v8
+; CHECK-NEXT: vor.vv v24, v24, v8
; CHECK-NEXT: beqz a4, .LBB8_1
; CHECK-NEXT: # %bb.2: # %middle.block
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a2, a1, 5
-; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: add sp, sp, a1
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
index 386c736128794..f295bd8d74df3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
@@ -2258,18 +2258,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
; CHECK-RV32-NEXT: vsetvli a7, zero, e32, m2, ta, ma
; CHECK-RV32-NEXT: .LBB98_3: # %vector.body
; CHECK-RV32-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-RV32-NEXT: mv a7, a6
-; CHECK-RV32-NEXT: slli t0, a6, 2
-; CHECK-RV32-NEXT: add a6, a6, a4
-; CHECK-RV32-NEXT: add t0, a0, t0
-; CHECK-RV32-NEXT: vl2re32.v v8, (t0)
-; CHECK-RV32-NEXT: sltu a7, a6, a7
-; CHECK-RV32-NEXT: add a5, a5, a7
-; CHECK-RV32-NEXT: xor a7, a6, a3
+; CHECK-RV32-NEXT: slli a7, a6, 2
+; CHECK-RV32-NEXT: add t0, a6, a4
+; CHECK-RV32-NEXT: add a7, a0, a7
+; CHECK-RV32-NEXT: vl2re32.v v8, (a7)
+; CHECK-RV32-NEXT: sltu a6, t0, a6
+; CHECK-RV32-NEXT: add a5, a5, a6
+; CHECK-RV32-NEXT: xor a6, t0, a3
; CHECK-RV32-NEXT: vand.vx v8, v8, a1
-; CHECK-RV32-NEXT: or a7, a7, a5
-; CHECK-RV32-NEXT: vs2r.v v8, (t0)
-; CHECK-RV32-NEXT: bnez a7, .LBB98_3
+; CHECK-RV32-NEXT: or t1, a6, a5
+; CHECK-RV32-NEXT: vs2r.v v8, (a7)
+; CHECK-RV32-NEXT: mv a6, t0
+; CHECK-RV32-NEXT: bnez t1, .LBB98_3
; CHECK-RV32-NEXT: # %bb.4: # %middle.block
; CHECK-RV32-NEXT: bnez a3, .LBB98_6
; CHECK-RV32-NEXT: .LBB98_5: # %for.body
@@ -2350,18 +2350,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
; CHECK-ZVKB-NOZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma
; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_3: # %vector.body
; CHECK-ZVKB-NOZBB32-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-ZVKB-NOZBB32-NEXT: mv a7, a6
-; CHECK-ZVKB-NOZBB32-NEXT: slli t0, a6, 2
-; CHECK-ZVKB-NOZBB32-NEXT: add a6, a6, a4
-; CHECK-ZVKB-NOZBB32-NEXT: add t0, a0, t0
-; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (t0)
-; CHECK-ZVKB-NOZBB32-NEXT: sltu a7, a6, a7
-; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a7
-; CHECK-ZVKB-NOZBB32-NEXT: xor a7, a6, a3
+; CHECK-ZVKB-NOZBB32-NEXT: slli a7, a6, 2
+; CHECK-ZVKB-NOZBB32-NEXT: add t0, a6, a4
+; CHECK-ZVKB-NOZBB32-NEXT: add a7, a0, a7
+; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (a7)
+; CHECK-ZVKB-NOZBB32-NEXT: sltu a6, t0, a6
+; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a6
+; CHECK-ZVKB-NOZBB32-NEXT: xor a6, t0, a3
; CHECK-ZVKB-NOZBB32-NEXT: vandn.vx v8, v8, a1
-; CHECK-ZVKB-NOZBB32-NEXT: or a7, a7, a5
-; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (t0)
-; CHECK-ZVKB-NOZBB32-NEXT: bnez a7, .LBB98_3
+; CHECK-ZVKB-NOZBB32-NEXT: or t1, a6, a5
+; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (a7)
+; CHECK-ZVKB-NOZBB32-NEXT: mv a6, t0
+; CHECK-ZVKB-NOZBB32-NEXT: bnez t1, .LBB98_3
; CHECK-ZVKB-NOZBB32-NEXT: # %bb.4: # %middle.block
; CHECK-ZVKB-NOZBB32-NEXT: bnez a3, .LBB98_7
; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_5: # %for.body.preheader
@@ -2444,18 +2444,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
; CHECK-ZVKB-ZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma
; CHECK-ZVKB-ZBB32-NEXT: .LBB98_3: # %vector.body
; CHECK-ZVKB-ZBB32-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-ZVKB-ZBB32-NEXT: mv a7, a6
-; CHECK-ZVKB-ZBB32-NEXT: slli t0, a6, 2
-; CHECK-ZVKB-ZBB32-NEXT: add a6, a6, a4
-; CHECK-ZVKB-ZBB32-NEXT: add t0, a0, t0
-; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (t0)
-; CHECK-ZVKB-ZBB32-NEXT: sltu a7, a6, a7
-; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a7
-; CHECK-ZVKB-ZBB32-NEXT: xor a7, a6, a3
+; CHECK-ZVKB-ZBB32-NEXT: slli a7, a6, 2
+; CHECK-ZVKB-ZBB32-NEXT: add t0, a6, a4
+; CHECK-ZVKB-ZBB32-NEXT: add a7, a0, a7
+; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (a7)
+; CHECK-ZVKB-ZBB32-NEXT: sltu a6, t0, a6
+; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a6
+; CHECK-ZVKB-ZBB32-NEXT: xor a6, t0, a3
; CHECK-ZVKB-ZBB32-NEXT: vandn.vx v8, v8, a1
-; CHECK-ZVKB-ZBB32-NEXT: or a7, a7, a5
-; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (t0)
-; CHECK-ZVKB-ZBB32-NEXT: bnez a7, .LBB98_3
+; CHECK-ZVKB-ZBB32-NEXT: or t1, a6, a5
+; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (a7)
+; CHECK-ZVKB-ZBB32-NEXT: mv a6, t0
+; CHECK-ZVKB-ZBB32-NEXT: bnez t1, .LBB98_3
; CHECK-ZVKB-ZBB32-NEXT: # %bb.4: # %middle.block
; CHECK-ZVKB-ZBB32-NEXT: bnez a3, .LBB98_6
; CHECK-ZVKB-ZBB32-NEXT: .LBB98_5: # %for.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
index 10440089cff10..ed6b7f1e6efb8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
@@ -25,24 +25,24 @@ define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr noc
; RV32-NEXT: li a6, 0
; RV32-NEXT: .LBB0_4: # %vector.body
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32-NEXT: mv t0, a7
-; RV32-NEXT: slli t1, a7, 2
-; RV32-NEXT: addi a7, a7, 8
-; RV32-NEXT: add t1, a1, t1
+; RV32-NEXT: slli t0, a7, 2
+; RV32-NEXT: addi t1, a7, 8
+; RV32-NEXT: add t0, a1, t0
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (t1)
-; RV32-NEXT: sltu t0, a7, t0
-; RV32-NEXT: xor t1, a7, a5
-; RV32-NEXT: add a6, a6, t0
+; RV32-NEXT: vle32.v v8, (t0)
+; RV32-NEXT: sltu a7, t1, a7
+; RV32-NEXT: xor t0, t1, a5
+; RV32-NEXT: add a6, a6, a7
; RV32-NEXT: vmslt.vx v12, v8, a2
; RV32-NEXT: vcompress.vm v10, v8, v12
-; RV32-NEXT: vcpop.m t0, v12
-; RV32-NEXT: vsetvli zero, t0, e32, m2, ta, ma
+; RV32-NEXT: vcpop.m a7, v12
+; RV32-NEXT: vsetvli zero, a7, e32, m2, ta, ma
; RV32-NEXT: vse32.v v10, (a0)
-; RV32-NEXT: slli t0, t0, 2
-; RV32-NEXT: or t1, t1, a6
-; RV32-NEXT: add a0, a0, t0
-; RV32-NEXT: bnez t1, .LBB0_4
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: or t0, t0, a6
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: mv a7, t1
+; RV32-NEXT: bnez t0, .LBB0_4
; RV32-NEXT: # %bb.5: # %middle.block
; RV32-NEXT: bne a5, a3, .LBB0_9
; RV32-NEXT: .LBB0_6: # %for.cond.cleanup
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index e8d89d4066e43..2293a1e6979f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -895,21 +895,21 @@ define void @coalesce_vl_clobber(ptr %p) {
; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
; CHECK-NEXT: .LBB43_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vsetvli a3, a2, e8, mf8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: slli a3, a1, 32
-; CHECK-NEXT: vsetvli a1, a2, e8, mf8, ta, ma
+; CHECK-NEXT: slli a1, a1, 32
; CHECK-NEXT: vsetivli zero, 0, e8, mf2, ta, mu
; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: srli a3, a3, 32
+; CHECK-NEXT: srli a1, a1, 32
; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT: vslideup.vx v10, v9, a3, v0.t
+; CHECK-NEXT: vslideup.vx v10, v9, a1, v0.t
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: vse32.v v10, (a0), v0.t
; CHECK-NEXT: li a2, 1
+; CHECK-NEXT: mv a1, a3
; CHECK-NEXT: j .LBB43_1
entry:
br label %vector.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index af3b0852a6461..ead79fcf53d8b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -102,20 +102,20 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV32-NEXT: .LBB0_13: # %vector.body
; RV32-NEXT: # Parent Loop BB0_10 Depth=1
; RV32-NEXT: # => This Inner Loop Header: Depth=2
-; RV32-NEXT: mv s0, t6
-; RV32-NEXT: add t6, a2, t6
-; RV32-NEXT: add s1, a4, s0
-; RV32-NEXT: vl2r.v v8, (t6)
-; RV32-NEXT: add s2, a0, s0
+; RV32-NEXT: add s0, a2, t6
+; RV32-NEXT: add s1, a4, t6
+; RV32-NEXT: vl2r.v v8, (s0)
+; RV32-NEXT: add s0, a0, t6
; RV32-NEXT: vl2r.v v10, (s1)
-; RV32-NEXT: add t6, s0, t2
-; RV32-NEXT: sltu s0, t6, s0
-; RV32-NEXT: add t5, t5, s0
-; RV32-NEXT: xor s0, t6, t4
+; RV32-NEXT: add s1, t6, t2
+; RV32-NEXT: sltu t6, s1, t6
+; RV32-NEXT: add t5, t5, t6
+; RV32-NEXT: xor t6, s1, t4
; RV32-NEXT: vaaddu.vv v8, v8, v10
-; RV32-NEXT: or s0, s0, t5
-; RV32-NEXT: vs2r.v v8, (s2)
-; RV32-NEXT: bnez s0, .LBB0_13
+; RV32-NEXT: or s2, t6, t5
+; RV32-NEXT: vs2r.v v8, (s0)
+; RV32-NEXT: mv t6, s1
+; RV32-NEXT: bnez s2, .LBB0_13
; RV32-NEXT: # %bb.14: # %middle.block
; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT: beq t4, a6, .LBB0_9
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 98e082be4cad1..1769c5d2fd385 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -21,12 +21,11 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: it lt
; ENABLED-NEXT: bxlt lr
; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph
-; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
; ENABLED-NEXT: mov r11, r0
-; ENABLED-NEXT: ldr r0, [sp, #36]
+; ENABLED-NEXT: ldr r0, [sp, #32]
; ENABLED-NEXT: add.w r9, r2, #3
; ENABLED-NEXT: mov.w r12, #0
-; ENABLED-NEXT: mov.w r8, #1
; ENABLED-NEXT: mov r10, r11
; ENABLED-NEXT: uxth r0, r0
; ENABLED-NEXT: rsbs r5, r0, #0
@@ -50,16 +49,18 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: @ %bb.5: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: bic r0, r9, #3
-; ENABLED-NEXT: sub.w r4, r2, r12
+; ENABLED-NEXT: movs r7, #1
; ENABLED-NEXT: subs r0, #4
+; ENABLED-NEXT: sub.w r4, r2, r12
; ENABLED-NEXT: vmov.i32 q1, #0x0
-; ENABLED-NEXT: mov r7, r10
-; ENABLED-NEXT: add.w r6, r8, r0, lsr #2
+; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
; ENABLED-NEXT: adds r0, r2, #3
; ENABLED-NEXT: sub.w r0, r0, r12
; ENABLED-NEXT: bic r0, r0, #3
; ENABLED-NEXT: subs r0, #4
-; ENABLED-NEXT: add.w lr, r8, r0, lsr #2
+; ENABLED-NEXT: add.w r0, r7, r0, lsr #2
+; ENABLED-NEXT: mov r7, r10
+; ENABLED-NEXT: dls lr, r0
; ENABLED-NEXT: mov r0, r11
; ENABLED-NEXT: .LBB0_6: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
@@ -82,7 +83,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: vaddv.u32 r0, q0
; ENABLED-NEXT: b .LBB0_3
; ENABLED-NEXT: .LBB0_8:
-; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
; ENABLED-NEXT: bx lr
;
; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
@@ -91,12 +92,11 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: it lt
; NOREDUCTIONS-NEXT: bxlt lr
; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
; NOREDUCTIONS-NEXT: mov r11, r0
-; NOREDUCTIONS-NEXT: ldr r0, [sp, #36]
+; NOREDUCTIONS-NEXT: ldr r0, [sp, #32]
; NOREDUCTIONS-NEXT: add.w r9, r2, #3
; NOREDUCTIONS-NEXT: mov.w r12, #0
-; NOREDUCTIONS-NEXT: mov.w r8, #1
; NOREDUCTIONS-NEXT: mov r10, r11
; NOREDUCTIONS-NEXT: uxth r0, r0
; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
@@ -120,16 +120,18 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: bic r0, r9, #3
-; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
+; NOREDUCTIONS-NEXT: movs r7, #1
; NOREDUCTIONS-NEXT: subs r0, #4
+; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT: mov r7, r10
-; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2
+; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
; NOREDUCTIONS-NEXT: adds r0, r2, #3
; NOREDUCTIONS-NEXT: sub.w r0, r0, r12
; NOREDUCTIONS-NEXT: bic r0, r0, #3
; NOREDUCTIONS-NEXT: subs r0, #4
-; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2
+; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT: mov r7, r10
+; NOREDUCTIONS-NEXT: dls lr, r0
; NOREDUCTIONS-NEXT: mov r0, r11
; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
@@ -152,7 +154,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
; NOREDUCTIONS-NEXT: b .LBB0_3
; NOREDUCTIONS-NEXT: .LBB0_8:
-; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
; NOREDUCTIONS-NEXT: bx lr
entry:
%conv = sext i16 %N to i32
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index 435acc29f076e..cbcbf1f392ce8 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -165,73 +165,74 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" {
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: wls lr, r1, .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT: add.w r9, r3, #4
-; CHECK-NEXT: add.w r10, r0, #4
+; CHECK-NEXT: adds r6, r3, #4
+; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: mvn r8, #1
-; CHECK-NEXT: @ implicit-def: $r6
+; CHECK-NEXT: @ implicit-def: $r9
; CHECK-NEXT: @ implicit-def: $r4
; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr.w r1, [r10]
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [r1]
; CHECK-NEXT: muls r1, r3, r1
; CHECK-NEXT: adds r4, r4, r1
; CHECK-NEXT: adc.w r1, r2, r1, asr #31
; CHECK-NEXT: adds.w r2, r4, #-2147483648
-; CHECK-NEXT: ldrd r5, r4, [r8]
-; CHECK-NEXT: adc r2, r1, #0
+; CHECK-NEXT: ldrd r2, r4, [r8]
+; CHECK-NEXT: adc r5, r1, #0
+; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: smull r4, r2, r4, r9
+; CHECK-NEXT: asrs r1, r5, #31
; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: smull r4, r5, r4, r6
-; CHECK-NEXT: asrs r1, r2, #31
-; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: subs r4, r2, r4
-; CHECK-NEXT: sbcs r1, r5
-; CHECK-NEXT: adds.w r6, r4, #-2147483648
-; CHECK-NEXT: ldr r4, [r10, #-4]
-; CHECK-NEXT: adc r11, r1, #0
-; CHECK-NEXT: mov r1, r9
-; CHECK-NEXT: add.w r10, r10, #4
+; CHECK-NEXT: subs r4, r5, r4
+; CHECK-NEXT: sbcs r1, r2
+; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds.w r10, r4, #-2147483648
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: ldr r4, [r2, #-4]
; CHECK-NEXT: muls r4, r3, r4
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: adds.w r12, r4, #-2147483648
; CHECK-NEXT: asr.w r5, r4, #31
-; CHECK-NEXT: ldr.w r4, [r9]
+; CHECK-NEXT: ldr r4, [r6]
; CHECK-NEXT: adc r5, r5, #0
; CHECK-NEXT: mul r2, r4, r0
+; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: add.w r2, r2, #-2147483648
; CHECK-NEXT: asrl r12, r5, r2
-; CHECK-NEXT: smull r2, r9, r4, r12
-; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: lsll r2, r9, #30
-; CHECK-NEXT: asr.w r5, r9, #31
-; CHECK-NEXT: mov r2, r9
-; CHECK-NEXT: mov r9, r1
-; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload
-; CHECK-NEXT: lsll r2, r5, r4
-; CHECK-NEXT: lsrl r2, r5, #2
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r0, #2
-; CHECK-NEXT: lsll r2, r5, r0
-; CHECK-NEXT: add.w r0, r2, #-2147483648
+; CHECK-NEXT: smull r2, r5, r4, r12
+; CHECK-NEXT: lsll r2, r5, #30
+; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: asr.w r11, r5, #31
+; CHECK-NEXT: mov r12, r5
+; CHECK-NEXT: lsll r12, r11, r4
+; CHECK-NEXT: mul r2, r2, r9
+; CHECK-NEXT: lsrl r12, r11, #2
+; CHECK-NEXT: adds r2, #2
+; CHECK-NEXT: lsll r12, r11, r2
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
-; CHECK-NEXT: asrl r6, r11, r0
-; CHECK-NEXT: movs r0, #2
-; CHECK-NEXT: lsrl r6, r11, #2
-; CHECK-NEXT: str r6, [r0]
-; CHECK-NEXT: ldr r0, [r8], #-4
-; CHECK-NEXT: mls r0, r0, r4, r1
-; CHECK-NEXT: adds.w r4, r0, #-2147483648
-; CHECK-NEXT: asr.w r1, r0, #31
+; CHECK-NEXT: add.w r5, r12, #-2147483648
+; CHECK-NEXT: asrl r10, r1, r5
+; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: lsrl r10, r1, #2
+; CHECK-NEXT: movs r1, #2
+; CHECK-NEXT: mov r9, r10
+; CHECK-NEXT: str.w r10, [r1]
+; CHECK-NEXT: ldr r1, [r8], #-4
+; CHECK-NEXT: mls r5, r1, r4, r5
+; CHECK-NEXT: adds.w r4, r5, #-2147483648
+; CHECK-NEXT: asr.w r1, r5, #31
; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: lsrl r4, r1, #2
-; CHECK-NEXT: rsbs r0, r4, #0
-; CHECK-NEXT: str r0, [r2]
-; CHECK-NEXT: str r0, [r9, #-4]
-; CHECK-NEXT: add.w r9, r9, #4
-; CHECK-NEXT: add.w r0, r12, #4
+; CHECK-NEXT: rsbs r1, r4, #0
+; CHECK-NEXT: str r1, [r2]
+; CHECK-NEXT: str r1, [r6, #-4]
+; CHECK-NEXT: adds r6, #4
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: .LBB2_3: @ %while.end
; CHECK-NEXT: add sp, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index b60ee7c6d406b..0d86f22a321e0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -1313,29 +1313,27 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: @ Child Loop BB16_3 Depth 2
; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: .LBB16_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vmov q0, q6
-; CHECK-NEXT: vadd.i32 q6, q5, r0
-; CHECK-NEXT: vmov r7, r3, d13
+; CHECK-NEXT: vadd.i32 q1, q5, r0
; CHECK-NEXT: vadd.i32 q2, q4, r0
-; CHECK-NEXT: vmov r5, r6, d5
-; CHECK-NEXT: vmov q1, q7
-; CHECK-NEXT: vmov r4, r10, d12
+; CHECK-NEXT: vmov r7, r3, d3
; CHECK-NEXT: vadd.i32 q6, q0, lr
+; CHECK-NEXT: vmov r5, r6, d5
; CHECK-NEXT: subs.w r9, r9, #16
+; CHECK-NEXT: vmov r4, r10, d2
+; CHECK-NEXT: vadd.i32 q1, q7, lr
; CHECK-NEXT: vadd.i32 q4, q4, lr
; CHECK-NEXT: vadd.i32 q5, q5, lr
-; CHECK-NEXT: vadd.i32 q7, q7, lr
; CHECK-NEXT: ldrb.w r11, [r3]
; CHECK-NEXT: ldrb r3, [r7]
; CHECK-NEXT: vmov r7, r12, d4
-; CHECK-NEXT: vadd.i32 q2, q1, r0
-; CHECK-NEXT: vadd.i32 q1, q0, r0
+; CHECK-NEXT: vadd.i32 q2, q7, r0
+; CHECK-NEXT: vadd.i32 q7, q0, r0
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: ldrb r4, [r4]
@@ -1344,7 +1342,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: ldrb.w r1, [r12]
; CHECK-NEXT: vmov.8 q0[0], r7
; CHECK-NEXT: vmov.8 q0[1], r1
-; CHECK-NEXT: vmov r1, r7, d3
+; CHECK-NEXT: vmov r1, r7, d15
; CHECK-NEXT: vmov.8 q0[2], r5
; CHECK-NEXT: vmov.8 q0[3], r6
; CHECK-NEXT: vmov.8 q0[4], r4
@@ -1359,7 +1357,8 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: ldrb r3, [r5]
; CHECK-NEXT: ldrb.w r12, [r7]
; CHECK-NEXT: ldrb r5, [r4]
-; CHECK-NEXT: vmov r4, r7, d2
+; CHECK-NEXT: vmov r4, r7, d14
+; CHECK-NEXT: vmov q7, q1
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[8], r4
@@ -1371,6 +1370,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: vmov.8 q0[14], r3
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: vstrb.8 q0, [r8], #16
+; CHECK-NEXT: vmov q0, q6
; CHECK-NEXT: bne .LBB16_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index c0b2da7eff41b..eedca2cd4a5d3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT: vadd.i32 q3, q1, q0
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vadd.i32 q1, q1, q0
-; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2]
+; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: bne .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
@@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r4, .LCPI7_0
; CHECK-NEXT: mov.w r12, #9
-; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: mov.w lr, #12
; CHECK-NEXT: movs r4, #8
-; CHECK-NEXT: vdup.32 q1, r0
+; CHECK-NEXT: vdup.32 q0, r0
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmla.i32 q3, q2, lr
-; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vadd.i32 q2, q1, r4
+; CHECK-NEXT: vmla.i32 q3, q1, lr
+; CHECK-NEXT: vmul.i32 q1, q1, r12
; CHECK-NEXT: vldrw.u32 q4, [q3, #24]
-; CHECK-NEXT: vmul.i32 q2, q2, r12
-; CHECK-NEXT: vadd.i32 q0, q0, r4
-; CHECK-NEXT: vstrw.32 q2, [r3]
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vstrw.32 q1, [r3]
+; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
@@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: adr r4, .LCPI8_0
; CHECK-NEXT: movs r5, #18
-; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: mov.w r12, #9
; CHECK-NEXT: mov.w lr, #12
; CHECK-NEXT: movs r4, #8
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vdup.32 q2, r5
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: vdup.32 q1, r5
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vmla.i32 q4, q3, lr
+; CHECK-NEXT: vmov q4, q0
+; CHECK-NEXT: vadd.i32 q3, q2, r4
+; CHECK-NEXT: vmla.i32 q4, q2, lr
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldrw.u32 q5, [q4, #24]
-; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vmla.i32 q4, q3, r12
-; CHECK-NEXT: vadd.i32 q0, q0, r4
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vmla.i32 q4, q2, r12
+; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrb.8 q5, [r1], #16
; CHECK-NEXT: vstrw.32 q4, [r3]
; CHECK-NEXT: bne .LBB8_1
@@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: vmov q7, q2
; CHECK-NEXT: dls lr, r10
; CHECK-NEXT: vmov.i32 q5, #0x0
-; CHECK-NEXT: vmlas.i32 q1, q0, r7
-; CHECK-NEXT: vmov q7, q4
+; CHECK-NEXT: vmlas.i32 q7, q0, r7
+; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vmov q6, q1
-; CHECK-NEXT: vadd.i32 q1, q1, q3
-; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
-; CHECK-NEXT: vldrw.u32 q6, [q7, #32]!
-; CHECK-NEXT: vmul.i32 q0, q0, q6
-; CHECK-NEXT: vadd.i32 q5, q0, q5
+; CHECK-NEXT: vadd.i32 q0, q7, q3
+; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q7, [q6, #32]!
+; CHECK-NEXT: vmul.i32 q1, q1, q7
+; CHECK-NEXT: vmov q7, q0
+; CHECK-NEXT: vadd.i32 q5, q1, q5
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
@@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: vadd.i32 q5, q5, q3
-; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1]
-; CHECK-NEXT: vldrh.s32 q6, [r3], #8
-; CHECK-NEXT: vmul.i32 q6, q7, q6
-; CHECK-NEXT: vadd.i32 q4, q6, q4
+; CHECK-NEXT: vadd.i32 q6, q5, q3
+; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
+; CHECK-NEXT: vldrh.s32 q5, [r3], #8
+; CHECK-NEXT: vmul.i32 q5, q7, q5
+; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
@@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
-; CHECK-NEXT: vmov q7, q5
-; CHECK-NEXT: vmov q6, q4
-; CHECK-NEXT: vldrb.s32 q2, [r0, q7]
-; CHECK-NEXT: vldrb.s32 q7, [r1, q6]
-; CHECK-NEXT: subs r5, #4
-; CHECK-NEXT: vadd.i32 q4, q4, q0
+; CHECK-NEXT: vldrb.s32 q2, [r0, q5]
+; CHECK-NEXT: vadd.i32 q7, q5, q0
+; CHECK-NEXT: vldrb.s32 q5, [r1, q4]
+; CHECK-NEXT: vadd.i32 q6, q4, q0
; CHECK-NEXT: vadd.i32 q2, q2, r2
-; CHECK-NEXT: vadd.i32 q5, q5, q0
-; CHECK-NEXT: vmlava.u32 r12, q2, q7
+; CHECK-NEXT: subs r5, #4
+; CHECK-NEXT: vmlava.u32 r12, q2, q5
+; CHECK-NEXT: vmov q5, q7
+; CHECK-NEXT: vmov q4, q6
; CHECK-NEXT: bne .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
index 652d25af02e7c..828f8e4f83048 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -105,66 +105,68 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #12
-; CHECK-NEXT: sub sp, #12
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: stm.w sp, {r0, r1, r3} @ 12-byte Folded Spill
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: blt .LBB4_12
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: ldr r1, [sp, #48]
-; CHECK-NEXT: add.w r12, r2, #3
+; CHECK-NEXT: ldr r7, [sp, #44]
+; CHECK-NEXT: add.w r10, r2, #3
; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r8, r2
+; CHECK-NEXT: mov r9, r2
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: uxth r3, r1
+; CHECK-NEXT: uxth.w r12, r7
+; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: str.w r9, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: b .LBB4_4
; CHECK-NEXT: .LBB4_2: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: .LBB4_3: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: lsrs r2, r6, #16
-; CHECK-NEXT: sub.w r12, r12, #1
+; CHECK-NEXT: lsrs r0, r6, #16
+; CHECK-NEXT: sub.w r10, r10, #1
+; CHECK-NEXT: strh.w r0, [r5, r8, lsl #1]
+; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: add.w r11, r11, #2
-; CHECK-NEXT: sub.w r8, r8, #1
-; CHECK-NEXT: strh.w r2, [r7, r10, lsl #1]
-; CHECK-NEXT: add.w r10, r10, #1
-; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: cmp r10, r2
-; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: sub.w r9, r9, #1
+; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: beq .LBB4_12
; CHECK-NEXT: .LBB4_4: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_8 Depth 2
; CHECK-NEXT: @ Child Loop BB4_11 Depth 2
-; CHECK-NEXT: cmp r2, r10
+; CHECK-NEXT: cmp r0, r8
; CHECK-NEXT: ble .LBB4_2
; CHECK-NEXT: @ %bb.5: @ %vector.main.loop.iter.check
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: sub.w r4, r2, r10
-; CHECK-NEXT: cmp r4, #8
+; CHECK-NEXT: sub.w r0, r0, r8
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: cmp r0, #8
; CHECK-NEXT: bhs .LBB4_7
; CHECK-NEXT: @ %bb.6: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: mov.w r9, #0
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: b .LBB4_10
; CHECK-NEXT: .LBB4_7: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: bic r2, r8, #7
-; CHECK-NEXT: movs r7, #1
-; CHECK-NEXT: subs r2, #8
-; CHECK-NEXT: bic r9, r4, #7
-; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: bic r7, r9, #7
+; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: subs r7, #8
+; CHECK-NEXT: bic r1, r0, #7
; CHECK-NEXT: mov r5, r11
-; CHECK-NEXT: add.w lr, r7, r2, lsr #3
-; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w lr, r6, r7, lsr #3
+; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload
; CHECK-NEXT: .LBB4_8: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vldrh.u16 q0, [r2], #16
+; CHECK-NEXT: vldrh.u16 q0, [r4], #16
; CHECK-NEXT: vldrh.u16 q1, [r5], #16
-; CHECK-NEXT: rsbs r7, r3, #0
+; CHECK-NEXT: rsb.w r7, r12, #0
; CHECK-NEXT: vmullb.s16 q2, q1, q0
; CHECK-NEXT: vmullt.s16 q0, q1, q0
; CHECK-NEXT: vshl.s32 q2, r7
@@ -174,29 +176,32 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
; CHECK-NEXT: le lr, .LBB4_8
; CHECK-NEXT: @ %bb.9: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: cmp r4, r9
+; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: beq .LBB4_3
; CHECK-NEXT: .LBB4_10: @ %vec.epilog.ph
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: add.w r2, r9, r10
-; CHECK-NEXT: add.w r7, r1, r9, lsl #1
-; CHECK-NEXT: add.w r2, r1, r2, lsl #1
-; CHECK-NEXT: sub.w r5, r8, r9
-; CHECK-NEXT: dlstp.32 lr, r5
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r1, r8
+; CHECK-NEXT: sub.w r7, r9, r1
+; CHECK-NEXT: add.w r2, r0, r1, lsl #1
+; CHECK-NEXT: add.w r4, r0, r4, lsl #1
+; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: rsbs r4, r3, #0
-; CHECK-NEXT: vldrh.s32 q0, [r7], #8
-; CHECK-NEXT: vldrh.s32 q1, [r2], #8
+; CHECK-NEXT: rsb.w r0, r12, #0
+; CHECK-NEXT: vldrh.s32 q0, [r2], #8
+; CHECK-NEXT: vldrh.s32 q1, [r4], #8
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vshl.s32 q0, r4
+; CHECK-NEXT: vshl.s32 q0, r0
; CHECK-NEXT: vaddva.u32 r6, q0
; CHECK-NEXT: letp lr, .LBB4_11
; CHECK-NEXT: b .LBB4_3
; CHECK-NEXT: .LBB4_12: @ %for.end17
-; CHECK-NEXT: add sp, #12
+; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%conv = sext i16 %Ls to i32
diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
index d6c5cde30ed73..43ed5eefbf4c7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
@@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
; CHECK-NEXT: csel r7, r6, r5, hs
; CHECK-NEXT: add.w lr, r7, #1
; CHECK-NEXT: mov r4, r5
-; CHECK-NEXT: vldrh.u16 q1, [r0], #32
+; CHECK-NEXT: vldrh.u16 q0, [r0], #32
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mov r8, r5
-; CHECK-NEXT: vldrh.u16 q2, [r1], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2
-; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2
-; CHECK-NEXT: vldrh.u16 q2, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2
; CHECK-NEXT: vldrh.u16 q1, [r1], #32
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1
+; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
+; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
+; CHECK-NEXT: vldrh.u16 q0, [r1], #32
; CHECK-NEXT: sub.w lr, lr, #1
; CHECK-NEXT: cmp.w lr, #0
-; CHECK-NEXT: vldrh.u16 q3, [r0], #32
+; CHECK-NEXT: vldrh.u16 q1, [r0], #32
; CHECK-NEXT: beq .LBB0_3
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB0_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2
-; CHECK-NEXT: vldrh.u16 q2, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1
-; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1
-; CHECK-NEXT: vldrh.u16 q3, [r0], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2
-; CHECK-NEXT: vldrh.u16 q1, [r1], #32
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
+; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
+; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
+; CHECK-NEXT: vldrh.u16 q1, [r0], #32
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
+; CHECK-NEXT: vldrh.u16 q0, [r1], #32
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
; CHECK-NEXT: movs r6, #14
; CHECK-NEXT: and.w r2, r6, r2, lsl #1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1
-; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1
-; CHECK-NEXT: vldrh.u16 q1, [r1, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
+; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
+; CHECK-NEXT: vldrh.u16 q0, [r1, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0
; CHECK-NEXT: vctp.16 r2
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrht.u16 q2, [r0]
+; CHECK-NEXT: vldrht.u16 q1, [r0]
; CHECK-NEXT: cmp r2, #9
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrht.u16 q0, [r1]
-; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0
-; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0
+; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0
+; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0
; CHECK-NEXT: blo .LBB0_10
; CHECK-NEXT: @ %bb.4: @ %do.body.1
; CHECK-NEXT: subs r2, #8
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
index 9e4faa96dbf26..75612ba645ca4 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
@@ -15,15 +15,16 @@ define void @shl_loop(ptr %a, i8 %shift, i32 %count) {
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: loop # label0:
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: local.tee 3
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: local.get 3
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load 0:p2align=0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shl
; CHECK-NEXT: v128.store 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: local.set 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i32.const -1
; CHECK-NEXT: i32.add
@@ -63,11 +64,10 @@ define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) {
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: loop # label1:
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: local.tee 3
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
-; CHECK-NEXT: local.tee 0
-; CHECK-NEXT: local.get 3
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load 0:p2align=0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shl
@@ -76,6 +76,8 @@ define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) {
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: i32.and
; CHECK-NEXT: local.set 1
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: local.set 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i32.const -1
; CHECK-NEXT: i32.add
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
index 06cf968512db8..8a8e7a3b4df2c 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -297,30 +297,30 @@ define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movl $buf, %ecx
-; CHECK-NEXT: movl $32, %edx
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: movl $buf, %edx
+; CHECK-NEXT: movl $32, %esi
; CHECK-NEXT: jmp .LBB5_1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_3: # %if.false
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: decl %esi
+; CHECK-NEXT: decl %eax
; CHECK-NEXT: .LBB5_4: # %loop.bb2
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: leal (%rdi,%rsi), %r8d
+; CHECK-NEXT: leal (%rdi,%rax), %r8d
; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: cmpw $7, %si
+; CHECK-NEXT: cmpw $7, %ax
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx)
+; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
; CHECK-NEXT: jne .LBB5_5
; CHECK-NEXT: .LBB5_1: # %loop.bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne .LBB5_3
; CHECK-NEXT: # %bb.2: # %if.true
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: incl %esi
+; CHECK-NEXT: incl %eax
; CHECK-NEXT: jmp .LBB5_4
; CHECK-NEXT: .LBB5_5: # %exit
; CHECK-NEXT: tilerelease
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index 477a0dce5c81c..cffd88c55bb0a 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -111,63 +111,62 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
; X86-NOBMI-NEXT: orl %ecx, %eax
; X86-NOBMI-NEXT: je .LBB1_3
; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader
-; X86-NOBMI-NEXT: xorl %esi, %esi
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: xorl %edx, %edx
; X86-NOBMI-NEXT: xorl %ecx, %ecx
-; X86-NOBMI-NEXT: xorl %edi, %edi
-; X86-NOBMI-NEXT: xorl %ebp, %ebp
+; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill
; X86-NOBMI-NEXT: .p2align 4
; X86-NOBMI-NEXT: .LBB1_2: # %for.body
; X86-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NOBMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT: movl (%eax,%edi,8), %ebp
-; X86-NOBMI-NEXT: movl 4(%eax,%edi,8), %ebx
+; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi
+; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx
; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOBMI-NEXT: movl %ebp, %eax
-; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl %edi, %eax
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: mull %esi
+; X86-NOBMI-NEXT: movl %edx, %ebp
; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NOBMI-NEXT: movl %ebx, %eax
-; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT: movl %eax, %ebx
-; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NOBMI-NEXT: adcl $0, %edx
-; X86-NOBMI-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NOBMI-NEXT: movl %ebp, %eax
+; X86-NOBMI-NEXT: mull %esi
+; X86-NOBMI-NEXT: movl %edx, %ebx
+; X86-NOBMI-NEXT: movl %eax, %esi
+; X86-NOBMI-NEXT: addl %ebp, %esi
+; X86-NOBMI-NEXT: adcl $0, %ebx
+; X86-NOBMI-NEXT: movl %edi, %eax
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOBMI-NEXT: mull %edx
-; X86-NOBMI-NEXT: movl %eax, %ebp
-; X86-NOBMI-NEXT: addl %ebx, %ebp
-; X86-NOBMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOBMI-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NOBMI-NEXT: movl %edx, %ebx
-; X86-NOBMI-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-NOBMI-NEXT: movl %edx, %ebp
+; X86-NOBMI-NEXT: movl %eax, %edi
+; X86-NOBMI-NEXT: addl %esi, %edi
+; X86-NOBMI-NEXT: adcl %ebx, %ebp
+; X86-NOBMI-NEXT: setb %bl
; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT: movl %eax, %esi
-; X86-NOBMI-NEXT: addl %ebx, %esi
-; X86-NOBMI-NEXT: movl %ecx, %eax
-; X86-NOBMI-NEXT: movzbl (%esp), %ebx # 1-byte Folded Reload
-; X86-NOBMI-NEXT: movl %edx, %ecx
-; X86-NOBMI-NEXT: adcl %ebx, %ecx
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NOBMI-NEXT: adcl %eax, %ebp
-; X86-NOBMI-NEXT: adcl $0, %esi
-; X86-NOBMI-NEXT: adcl $0, %ecx
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT: movl %edx, (%eax,%edi,8)
-; X86-NOBMI-NEXT: movl %ebp, 4(%eax,%edi,8)
-; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT: addl $1, %edi
-; X86-NOBMI-NEXT: adcl $0, %ebp
-; X86-NOBMI-NEXT: movl %edi, %eax
-; X86-NOBMI-NEXT: xorl %edx, %eax
-; X86-NOBMI-NEXT: movl %ebp, %edx
-; X86-NOBMI-NEXT: xorl %ebx, %edx
-; X86-NOBMI-NEXT: orl %eax, %edx
+; X86-NOBMI-NEXT: addl %ebp, %eax
+; X86-NOBMI-NEXT: movzbl %bl, %esi
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NOBMI-NEXT: adcl %esi, %edx
+; X86-NOBMI-NEXT: movl %ecx, %ebx
+; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NOBMI-NEXT: adcl $0, %eax
+; X86-NOBMI-NEXT: adcl $0, %edx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8)
+; X86-NOBMI-NEXT: movl %ebx, %ecx
+; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8)
+; X86-NOBMI-NEXT: addl $1, %ecx
+; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NOBMI-NEXT: adcl $0, %edi
+; X86-NOBMI-NEXT: movl %ecx, %esi
+; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NOBMI-NEXT: xorl %ebp, %edi
+; X86-NOBMI-NEXT: orl %esi, %edi
; X86-NOBMI-NEXT: jne .LBB1_2
; X86-NOBMI-NEXT: .LBB1_3: # %for.end
; X86-NOBMI-NEXT: xorl %eax, %eax
@@ -185,66 +184,71 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
; X86-BMI-NEXT: pushl %ebx
; X86-BMI-NEXT: pushl %edi
; X86-BMI-NEXT: pushl %esi
-; X86-BMI-NEXT: subl $16, %esp
+; X86-BMI-NEXT: subl $20, %esp
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BMI-NEXT: orl %ecx, %eax
; X86-BMI-NEXT: je .LBB1_3
; X86-BMI-NEXT: # %bb.1: # %for.body.preheader
-; X86-BMI-NEXT: xorl %esi, %esi
-; X86-BMI-NEXT: xorl %edi, %edi
+; X86-BMI-NEXT: xorl %ecx, %ecx
+; X86-BMI-NEXT: xorl %eax, %eax
; X86-BMI-NEXT: xorl %ebx, %ebx
-; X86-BMI-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-BMI-NEXT: xorl %ebp, %ebp
; X86-BMI-NEXT: .p2align 4
; X86-BMI-NEXT: .LBB1_2: # %for.body
; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx
-; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %ebp
-; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi
+; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-BMI-NEXT: movl %ecx, %edx
-; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edx, %eax
-; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: mulxl %eax, %edx, %edi
; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT: movl %ebp, %edx
-; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %ebp
-; X86-BMI-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; X86-BMI-NEXT: adcl $0, %ebp
+; X86-BMI-NEXT: movl %esi, %edx
+; X86-BMI-NEXT: mulxl %eax, %esi, %eax
+; X86-BMI-NEXT: addl %edi, %esi
+; X86-BMI-NEXT: adcl $0, %eax
; X86-BMI-NEXT: movl %ecx, %edx
-; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx
-; X86-BMI-NEXT: addl %eax, %ecx
-; X86-BMI-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-BMI-NEXT: movl %esi, %eax
-; X86-BMI-NEXT: adcl %ebp, %edx
-; X86-BMI-NEXT: movl %edx, %ebp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp
+; X86-BMI-NEXT: addl %esi, %edi
+; X86-BMI-NEXT: adcl %eax, %ebp
; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %esi, %edi
+; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax
; X86-BMI-NEXT: setb %dl
-; X86-BMI-NEXT: addl %ebp, %esi
+; X86-BMI-NEXT: addl %ebp, %ecx
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BMI-NEXT: movzbl %dl, %edx
-; X86-BMI-NEXT: adcl %edx, %edi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-BMI-NEXT: addl %eax, %edx
-; X86-BMI-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
-; X86-BMI-NEXT: adcl $0, %esi
-; X86-BMI-NEXT: adcl $0, %edi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT: movl %edx, (%eax,%ebx,8)
-; X86-BMI-NEXT: movl %ecx, 4(%eax,%ebx,8)
-; X86-BMI-NEXT: addl $1, %ebx
-; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT: adcl %edx, %eax
+; X86-BMI-NEXT: movl %eax, %edx
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-BMI-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
; X86-BMI-NEXT: adcl $0, %ecx
-; X86-BMI-NEXT: movl %ebx, %eax
-; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT: xorl %ebp, %ecx
-; X86-BMI-NEXT: orl %eax, %ecx
+; X86-BMI-NEXT: adcl $0, %edx
+; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8)
+; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: addl $1, %ebx
+; X86-BMI-NEXT: adcl $0, %ebp
+; X86-BMI-NEXT: movl %ebx, %edx
+; X86-BMI-NEXT: xorl %esi, %edx
+; X86-BMI-NEXT: movl %ebp, %esi
+; X86-BMI-NEXT: xorl %edi, %esi
+; X86-BMI-NEXT: orl %edx, %esi
+; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-BMI-NEXT: jne .LBB1_2
; X86-BMI-NEXT: .LBB1_3: # %for.end
; X86-BMI-NEXT: xorl %eax, %eax
; X86-BMI-NEXT: xorl %edx, %edx
-; X86-BMI-NEXT: addl $16, %esp
+; X86-BMI-NEXT: addl $20, %esp
; X86-BMI-NEXT: popl %esi
; X86-BMI-NEXT: popl %edi
; X86-BMI-NEXT: popl %ebx
@@ -257,12 +261,11 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
; X64-NOBMI-NEXT: je .LBB1_3
; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader
; X64-NOBMI-NEXT: movq %rdx, %r8
-; X64-NOBMI-NEXT: xorl %edx, %edx
+; X64-NOBMI-NEXT: xorl %r10d, %r10d
; X64-NOBMI-NEXT: xorl %r9d, %r9d
; X64-NOBMI-NEXT: .p2align 4
; X64-NOBMI-NEXT: .LBB1_2: # %for.body
; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NOBMI-NEXT: movq %rdx, %r10
; X64-NOBMI-NEXT: movq %rcx, %rax
; X64-NOBMI-NEXT: mulq (%r8,%r9,8)
; X64-NOBMI-NEXT: addq %r10, %rax
@@ -270,6 +273,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8)
; X64-NOBMI-NEXT: incq %r9
; X64-NOBMI-NEXT: cmpq %r9, %rdi
+; X64-NOBMI-NEXT: movq %rdx, %r10
; X64-NOBMI-NEXT: jne .LBB1_2
; X64-NOBMI-NEXT: .LBB1_3: # %for.end
; X64-NOBMI-NEXT: xorl %eax, %eax
@@ -281,12 +285,11 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
; X64-BMI-NEXT: je .LBB1_3
; X64-BMI-NEXT: # %bb.1: # %for.body.preheader
; X64-BMI-NEXT: movq %rdx, %rax
-; X64-BMI-NEXT: xorl %edx, %edx
+; X64-BMI-NEXT: xorl %r9d, %r9d
; X64-BMI-NEXT: xorl %r8d, %r8d
; X64-BMI-NEXT: .p2align 4
; X64-BMI-NEXT: .LBB1_2: # %for.body
; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-BMI-NEXT: movq %rdx, %r9
; X64-BMI-NEXT: movq %rcx, %rdx
; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx
; X64-BMI-NEXT: addq %r9, %r10
@@ -294,6 +297,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8)
; X64-BMI-NEXT: incq %r8
; X64-BMI-NEXT: cmpq %r8, %rdi
+; X64-BMI-NEXT: movq %rdx, %r9
; X64-BMI-NEXT: jne .LBB1_2
; X64-BMI-NEXT: .LBB1_3: # %for.end
; X64-BMI-NEXT: xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
index ebae51fa2aa46..08003739b55d0 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
@@ -16,11 +16,11 @@ define void @foo(i32 %N) nounwind {
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %bb
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movw %dx, X
-; CHECK-NEXT: movw %cx, Y
-; CHECK-NEXT: incl %edx
-; CHECK-NEXT: addl $4, %ecx
-; CHECK-NEXT: cmpl %edx, %eax
+; CHECK-NEXT: movw %cx, X
+; CHECK-NEXT: movw %dx, Y
+; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: addl $4, %edx
+; CHECK-NEXT: cmpl %ecx, %eax
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: .LBB0_3: # %return
; CHECK-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 2a2a4a5ca18d3..209ee79d51419 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1480,15 +1480,15 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB10_1
; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1728,10 +1728,10 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm7, %ymm4, %ymm4
; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3
@@ -1739,9 +1739,9 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB11_1
; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1765,15 +1765,15 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2
-; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2
-; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB11_1
; AVX512-NEXT: # %bb.2: # %middle.block
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll
index 173c41140ebef..1a7551f6117e8 100644
--- a/llvm/test/CodeGen/X86/pr49451.ll
+++ b/llvm/test/CodeGen/X86/pr49451.ll
@@ -18,15 +18,15 @@ define void @func_6(i8 %uc_8, i64 %uli_10) nounwind {
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB0_1: # %for.body612
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: testb %dl, %dl
+; X86-NEXT: testb %bl, %bl
; X86-NEXT: je .LBB0_2
; X86-NEXT: # %bb.3: # %if.end1401
; X86-NEXT: # in Loop: Header=BB0_1 Depth=1
; X86-NEXT: addl %eax, %esi
; X86-NEXT: movw %si, s_2
-; X86-NEXT: movw %bx, s_0
+; X86-NEXT: movw %dx, s_0
; X86-NEXT: incl %ecx
-; X86-NEXT: incl %ebx
+; X86-NEXT: incl %edx
; X86-NEXT: cmpw $73, %cx
; X86-NEXT: jl .LBB0_1
; X86-NEXT: # %bb.4: # %for.body1703
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 1c3d27fac4203..c0962236f93dd 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -12729,43 +12729,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK9-NEXT: pushq %rbx
; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK9-NEXT: movl (%rsi), %edi
+; FALLBACK9-NEXT: movl (%rsi), %eax
; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: leal (,%rdi,8), %ecx
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
; FALLBACK9-NEXT: andl $56, %ecx
-; FALLBACK9-NEXT: andl $56, %edi
-; FALLBACK9-NEXT: movq -96(%rsp,%rdi), %rsi
-; FALLBACK9-NEXT: movq -104(%rsp,%rdi), %r9
-; FALLBACK9-NEXT: movq %r9, %rax
-; FALLBACK9-NEXT: shrdq %cl, %rsi, %rax
-; FALLBACK9-NEXT: movq -112(%rsp,%rdi), %r10
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
; FALLBACK9-NEXT: movq %r10, %r8
; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK9-NEXT: movq -80(%rsp,%rdi), %r9
-; FALLBACK9-NEXT: movq -88(%rsp,%rdi), %r11
+; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
; FALLBACK9-NEXT: movq %r11, %rbx
; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK9-NEXT: shrdq %cl, %r11, %rsi
-; FALLBACK9-NEXT: movq -72(%rsp,%rdi), %r11
+; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK9-NEXT: movq -128(%rsp,%rdi), %r14
-; FALLBACK9-NEXT: movq -120(%rsp,%rdi), %rdi
-; FALLBACK9-NEXT: movq %rdi, %r15
+; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r15
; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %r14
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
; FALLBACK9-NEXT: shrq %cl, %r11
; FALLBACK9-NEXT: movq %r15, 8(%rdx)
; FALLBACK9-NEXT: movq %r9, 48(%rdx)
; FALLBACK9-NEXT: movq %r11, 56(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rax, 24(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
; FALLBACK9-NEXT: movq %r14, (%rdx)
; FALLBACK9-NEXT: popq %rbx
; FALLBACK9-NEXT: popq %r14
@@ -12906,45 +12906,45 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK12-NEXT: pushq %rbx
; FALLBACK12-NEXT: pushq %rax
; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK12-NEXT: movl (%rsi), %r10d
+; FALLBACK12-NEXT: movl (%rsi), %r9d
; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: leal (,%r10,8), %eax
+; FALLBACK12-NEXT: leal (,%r9,8), %eax
; FALLBACK12-NEXT: andl $56, %eax
-; FALLBACK12-NEXT: andl $56, %r10d
-; FALLBACK12-NEXT: movq -128(%rsp,%r10), %r9
-; FALLBACK12-NEXT: movq -120(%rsp,%r10), %r8
+; FALLBACK12-NEXT: andl $56, %r9d
+; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8
; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: shrq %cl, %r10
; FALLBACK12-NEXT: movl %eax, %esi
; FALLBACK12-NEXT: notb %sil
; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
; FALLBACK12-NEXT: movl %esi, %ecx
; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r9, %rdi
-; FALLBACK12-NEXT: movq -104(%rsp,%r10), %r9
-; FALLBACK12-NEXT: movq %r9, %rbx
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %rbx
; FALLBACK12-NEXT: movl %eax, %ecx
; FALLBACK12-NEXT: shrq %cl, %rbx
-; FALLBACK12-NEXT: movq -96(%rsp,%r10), %r12
+; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12
; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
; FALLBACK12-NEXT: movl %esi, %ecx
; FALLBACK12-NEXT: shlq %cl, %r11
; FALLBACK12-NEXT: orq %rbx, %r11
-; FALLBACK12-NEXT: movq -112(%rsp,%r10), %rbx
+; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx
; FALLBACK12-NEXT: movq %rbx, %r14
; FALLBACK12-NEXT: movl %eax, %ecx
; FALLBACK12-NEXT: shrq %cl, %r14
-; FALLBACK12-NEXT: addq %r9, %r9
+; FALLBACK12-NEXT: addq %r10, %r10
; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r9
-; FALLBACK12-NEXT: orq %r14, %r9
-; FALLBACK12-NEXT: movq -88(%rsp,%r10), %r14
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r14, %r10
+; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14
; FALLBACK12-NEXT: movq %r14, %r13
; FALLBACK12-NEXT: movl %eax, %ecx
; FALLBACK12-NEXT: shrq %cl, %r13
-; FALLBACK12-NEXT: movq -80(%rsp,%r10), %rbp
+; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp
; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
; FALLBACK12-NEXT: movl %esi, %ecx
; FALLBACK12-NEXT: shlq %cl, %r15
@@ -12957,8 +12957,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK12-NEXT: orq %r12, %r14
; FALLBACK12-NEXT: movl %eax, %ecx
; FALLBACK12-NEXT: shrq %cl, %rbp
-; FALLBACK12-NEXT: movq -72(%rsp,%r10), %r10
-; FALLBACK12-NEXT: leaq (%r10,%r10), %r12
+; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %r12
; FALLBACK12-NEXT: movl %esi, %ecx
; FALLBACK12-NEXT: shlq %cl, %r12
; FALLBACK12-NEXT: orq %rbp, %r12
@@ -12969,13 +12969,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK12-NEXT: shlq %cl, %rbx
; FALLBACK12-NEXT: orq %r8, %rbx
; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movq %r10, 56(%rdx)
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 56(%rdx)
; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
; FALLBACK12-NEXT: movq %r12, 48(%rdx)
; FALLBACK12-NEXT: movq %r14, 32(%rdx)
; FALLBACK12-NEXT: movq %r15, 40(%rdx)
-; FALLBACK12-NEXT: movq %r9, 16(%rdx)
+; FALLBACK12-NEXT: movq %r10, 16(%rdx)
; FALLBACK12-NEXT: movq %r11, 24(%rdx)
; FALLBACK12-NEXT: movq %rdi, (%rdx)
; FALLBACK12-NEXT: addq $8, %rsp
@@ -13111,40 +13111,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK15-NEXT: pushq %r14
; FALLBACK15-NEXT: pushq %rbx
; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK15-NEXT: movl (%rsi), %edi
+; FALLBACK15-NEXT: movl (%rsi), %eax
; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: leal (,%rdi,8), %ecx
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
; FALLBACK15-NEXT: andl $56, %ecx
-; FALLBACK15-NEXT: andl $56, %edi
-; FALLBACK15-NEXT: movq -96(%rsp,%rdi), %rsi
-; FALLBACK15-NEXT: movq -104(%rsp,%rdi), %r9
-; FALLBACK15-NEXT: movq %r9, %rax
-; FALLBACK15-NEXT: shrdq %cl, %rsi, %rax
-; FALLBACK15-NEXT: movq -112(%rsp,%rdi), %r10
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
; FALLBACK15-NEXT: movq %r10, %r8
; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK15-NEXT: movq -80(%rsp,%rdi), %r9
-; FALLBACK15-NEXT: movq -88(%rsp,%rdi), %r11
+; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
; FALLBACK15-NEXT: movq %r11, %rbx
; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK15-NEXT: shrdq %cl, %r11, %rsi
-; FALLBACK15-NEXT: movq -72(%rsp,%rdi), %r11
+; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK15-NEXT: movq -128(%rsp,%rdi), %r14
-; FALLBACK15-NEXT: movq -120(%rsp,%rdi), %rdi
-; FALLBACK15-NEXT: movq %rdi, %r15
+; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r15
; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10
; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %r14
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
; FALLBACK15-NEXT: movq %r15, 8(%rdx)
; FALLBACK15-NEXT: movq %r9, 48(%rdx)
-; FALLBACK15-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
; FALLBACK15-NEXT: movq %r14, (%rdx)
; FALLBACK15-NEXT: movq %r10, 56(%rdx)
; FALLBACK15-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index 9fbbba2ed3b47..37620ecf8c1b8 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -1185,10 +1185,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 {
; ENABLE-NEXT: .p2align 4
; ENABLE-NEXT: LBB14_2: ## %for.body
; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1
-; ENABLE-NEXT: movl %esi, %eax
+; ENABLE-NEXT: cmpl %esi, %edi
+; ENABLE-NEXT: setl %al
; ENABLE-NEXT: xorl %esi, %esi
-; ENABLE-NEXT: cmpl %eax, %edi
-; ENABLE-NEXT: setl %sil
+; ENABLE-NEXT: movb %al, %sil
; ENABLE-NEXT: incb %dl
; ENABLE-NEXT: cmpb $45, %dl
; ENABLE-NEXT: jl LBB14_2
@@ -1220,10 +1220,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 {
; DISABLE-NEXT: .p2align 4
; DISABLE-NEXT: LBB14_2: ## %for.body
; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1
-; DISABLE-NEXT: movl %esi, %eax
+; DISABLE-NEXT: cmpl %esi, %edi
+; DISABLE-NEXT: setl %al
; DISABLE-NEXT: xorl %esi, %esi
-; DISABLE-NEXT: cmpl %eax, %edi
-; DISABLE-NEXT: setl %sil
+; DISABLE-NEXT: movb %al, %sil
; DISABLE-NEXT: incb %dl
; DISABLE-NEXT: cmpb $45, %dl
; DISABLE-NEXT: jl LBB14_2
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll
index 59fbf7183abc6..2bef66825d8c0 100644
--- a/llvm/test/CodeGen/X86/xor.ll
+++ b/llvm/test/CodeGen/X86/xor.ll
@@ -62,12 +62,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind {
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB3_1: # %bb
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ecx, %edx
; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: jne .LBB3_1
; X86-NEXT: # %bb.2: # %bb12
; X86-NEXT: retl
@@ -78,12 +78,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind {
; X64-LIN-NEXT: .p2align 4
; X64-LIN-NEXT: .LBB3_1: # %bb
; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT: movl %esi, %ecx
; X64-LIN-NEXT: xorl %esi, %eax
-; X64-LIN-NEXT: movl %eax, %esi
-; X64-LIN-NEXT: notl %esi
-; X64-LIN-NEXT: andl %ecx, %esi
-; X64-LIN-NEXT: addl %esi, %esi
+; X64-LIN-NEXT: movl %eax, %ecx
+; X64-LIN-NEXT: notl %ecx
+; X64-LIN-NEXT: andl %esi, %ecx
+; X64-LIN-NEXT: addl %ecx, %ecx
+; X64-LIN-NEXT: movl %ecx, %esi
; X64-LIN-NEXT: jne .LBB3_1
; X64-LIN-NEXT: # %bb.2: # %bb12
; X64-LIN-NEXT: retq
@@ -94,12 +94,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind {
; X64-WIN-NEXT: .p2align 4
; X64-WIN-NEXT: .LBB3_1: # %bb
; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT: movl %edx, %ecx
; X64-WIN-NEXT: xorl %edx, %eax
-; X64-WIN-NEXT: movl %eax, %edx
-; X64-WIN-NEXT: notl %edx
-; X64-WIN-NEXT: andl %ecx, %edx
-; X64-WIN-NEXT: addl %edx, %edx
+; X64-WIN-NEXT: movl %eax, %ecx
+; X64-WIN-NEXT: notl %ecx
+; X64-WIN-NEXT: andl %edx, %ecx
+; X64-WIN-NEXT: addl %ecx, %ecx
+; X64-WIN-NEXT: movl %ecx, %edx
; X64-WIN-NEXT: jne .LBB3_1
; X64-WIN-NEXT: # %bb.2: # %bb12
; X64-WIN-NEXT: retq
@@ -126,13 +126,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind {
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB4_1: # %bb
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: xorl %edx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: testw %cx, %cx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: testw %dx, %dx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: jne .LBB4_1
; X86-NEXT: # %bb.2: # %bb12
; X86-NEXT: # kill: def $ax killed $ax killed $eax
@@ -144,13 +144,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind {
; X64-LIN-NEXT: .p2align 4
; X64-LIN-NEXT: .LBB4_1: # %bb
; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT: movl %esi, %ecx
-; X64-LIN-NEXT: xorl %ecx, %eax
-; X64-LIN-NEXT: movl %eax, %esi
-; X64-LIN-NEXT: notl %esi
-; X64-LIN-NEXT: andl %ecx, %esi
-; X64-LIN-NEXT: addl %esi, %esi
-; X64-LIN-NEXT: testw %si, %si
+; X64-LIN-NEXT: xorl %esi, %eax
+; X64-LIN-NEXT: movl %eax, %ecx
+; X64-LIN-NEXT: notl %ecx
+; X64-LIN-NEXT: andl %esi, %ecx
+; X64-LIN-NEXT: addl %ecx, %ecx
+; X64-LIN-NEXT: testw %cx, %cx
+; X64-LIN-NEXT: movl %ecx, %esi
; X64-LIN-NEXT: jne .LBB4_1
; X64-LIN-NEXT: # %bb.2: # %bb12
; X64-LIN-NEXT: # kill: def $ax killed $ax killed $eax
@@ -163,13 +163,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind {
; X64-WIN-NEXT: .p2align 4
; X64-WIN-NEXT: .LBB4_1: # %bb
; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT: movl %edx, %ecx
-; X64-WIN-NEXT: xorl %ecx, %eax
-; X64-WIN-NEXT: movl %eax, %edx
-; X64-WIN-NEXT: notl %edx
-; X64-WIN-NEXT: andl %ecx, %edx
-; X64-WIN-NEXT: addl %edx, %edx
-; X64-WIN-NEXT: testw %dx, %dx
+; X64-WIN-NEXT: xorl %edx, %eax
+; X64-WIN-NEXT: movl %eax, %ecx
+; X64-WIN-NEXT: notl %ecx
+; X64-WIN-NEXT: andl %edx, %ecx
+; X64-WIN-NEXT: addl %ecx, %ecx
+; X64-WIN-NEXT: testw %cx, %cx
+; X64-WIN-NEXT: movl %ecx, %edx
; X64-WIN-NEXT: jne .LBB4_1
; X64-WIN-NEXT: # %bb.2: # %bb12
; X64-WIN-NEXT: # kill: def $ax killed $ax killed $eax
@@ -197,12 +197,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind {
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB5_1: # %bb
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ecx, %edx
; X86-NEXT: xorb %cl, %al
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: notb %cl
-; X86-NEXT: andb %dl, %cl
-; X86-NEXT: addb %cl, %cl
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: notb %dl
+; X86-NEXT: andb %cl, %dl
+; X86-NEXT: addb %dl, %dl
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: jne .LBB5_1
; X86-NEXT: # %bb.2: # %bb12
; X86-NEXT: retl
@@ -213,12 +213,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind {
; X64-LIN-NEXT: .p2align 4
; X64-LIN-NEXT: .LBB5_1: # %bb
; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT: movl %esi, %ecx
; X64-LIN-NEXT: xorb %sil, %al
-; X64-LIN-NEXT: movl %eax, %esi
-; X64-LIN-NEXT: notb %sil
-; X64-LIN-NEXT: andb %cl, %sil
-; X64-LIN-NEXT: addb %sil, %sil
+; X64-LIN-NEXT: movl %eax, %ecx
+; X64-LIN-NEXT: notb %cl
+; X64-LIN-NEXT: andb %sil, %cl
+; X64-LIN-NEXT: addb %cl, %cl
+; X64-LIN-NEXT: movl %ecx, %esi
; X64-LIN-NEXT: jne .LBB5_1
; X64-LIN-NEXT: # %bb.2: # %bb12
; X64-LIN-NEXT: # kill: def $al killed $al killed $eax
@@ -230,12 +230,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind {
; X64-WIN-NEXT: .p2align 4
; X64-WIN-NEXT: .LBB5_1: # %bb
; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT: movl %edx, %ecx
; X64-WIN-NEXT: xorb %dl, %al
-; X64-WIN-NEXT: movl %eax, %edx
-; X64-WIN-NEXT: notb %dl
-; X64-WIN-NEXT: andb %cl, %dl
-; X64-WIN-NEXT: addb %dl, %dl
+; X64-WIN-NEXT: movl %eax, %ecx
+; X64-WIN-NEXT: notb %cl
+; X64-WIN-NEXT: andb %dl, %cl
+; X64-WIN-NEXT: addb %cl, %cl
+; X64-WIN-NEXT: movl %ecx, %edx
; X64-WIN-NEXT: jne .LBB5_1
; X64-WIN-NEXT: # %bb.2: # %bb12
; X64-WIN-NEXT: retq
@@ -262,12 +262,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind {
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB6_1: # %bb
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ecx, %edx
; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: jne .LBB6_1
; X86-NEXT: # %bb.2: # %bb12
; X86-NEXT: retl
@@ -278,12 +278,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind {
; X64-LIN-NEXT: .p2align 4
; X64-LIN-NEXT: .LBB6_1: # %bb
; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT: movl %esi, %ecx
; X64-LIN-NEXT: xorl %esi, %eax
-; X64-LIN-NEXT: movl %eax, %esi
-; X64-LIN-NEXT: xorl $2147483646, %esi # imm = 0x7FFFFFFE
-; X64-LIN-NEXT: andl %ecx, %esi
-; X64-LIN-NEXT: addl %esi, %esi
+; X64-LIN-NEXT: movl %eax, %ecx
+; X64-LIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE
+; X64-LIN-NEXT: andl %esi, %ecx
+; X64-LIN-NEXT: addl %ecx, %ecx
+; X64-LIN-NEXT: movl %ecx, %esi
; X64-LIN-NEXT: jne .LBB6_1
; X64-LIN-NEXT: # %bb.2: # %bb12
; X64-LIN-NEXT: retq
@@ -294,12 +294,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind {
; X64-WIN-NEXT: .p2align 4
; X64-WIN-NEXT: .LBB6_1: # %bb
; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT: movl %edx, %ecx
; X64-WIN-NEXT: xorl %edx, %eax
-; X64-WIN-NEXT: movl %eax, %edx
-; X64-WIN-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE
-; X64-WIN-NEXT: andl %ecx, %edx
-; X64-WIN-NEXT: addl %edx, %edx
+; X64-WIN-NEXT: movl %eax, %ecx
+; X64-WIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE
+; X64-WIN-NEXT: andl %edx, %ecx
+; X64-WIN-NEXT: addl %ecx, %ecx
+; X64-WIN-NEXT: movl %ecx, %edx
; X64-WIN-NEXT: jne .LBB6_1
; X64-WIN-NEXT: # %bb.2: # %bb12
; X64-WIN-NEXT: retq