[llvm-branch-commits] [llvm] AMDGPU: Really use AV classes by default for vector classes (PR #166483)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 10 10:59:42 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/166483
>From ed81a1e99a2e3a2c3dceb7b0b54bcf262c36d1ad Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 18 Sep 2025 15:52:48 +0900
Subject: [PATCH 1/2] AMDGPU: Really use AV classes by default for vector
classes
Update getRegClassFor to use AV classes in place of VGPRs for
gfx90a-gfx950. There are a handful of regressions. Most of them come
from enabling unprofitable rematerialization, which reduces the register
count by 1 but adds an unnecessary instruction.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 8 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 +
.../branch-folding-implicit-def-subreg.ll | 467 ++++----
.../AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll | 16 +-
.../AMDGPU/buffer-fat-pointers-memcpy.ll | 182 ++-
.../callee-special-input-vgprs-packed.ll | 12 +-
.../CodeGen/AMDGPU/dag-divergence-atomic.ll | 12 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 772 ++++++------
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 1044 ++++++++--------
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 1044 ++++++++--------
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 876 +++++++-------
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 84 +-
.../CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 31 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 800 ++++++-------
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 1048 ++++++++---------
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 1048 ++++++++---------
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 864 +++++++-------
.../CodeGen/AMDGPU/global-i16-load-store.ll | 12 +-
.../insert_waitcnt_for_precise_memory.ll | 4 +-
.../AMDGPU/lds-dma-workgroup-release.ll | 24 +-
.../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll | 8 +-
.../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 178 +--
...vm.amdgcn.sched.group.barrier.iterative.ll | 18 +-
.../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 620 +++++-----
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 126 +-
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 24 +-
...uffer-fat-pointers-nontemporal-metadata.ll | 12 +-
.../CodeGen/AMDGPU/masked-load-vectortypes.ll | 3 +-
llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll | 12 +-
.../AMDGPU/undef-handling-crash-in-ra.ll | 51 +-
31 files changed, 4694 insertions(+), 4715 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 98fe923147ccc..7a9a3daa4033a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18768,8 +18768,11 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
: &AMDGPU::SReg_32RegClass;
if (!TRI->isSGPRClass(RC) && !isDivergent)
return TRI->getEquivalentSGPRClass(RC);
- if (TRI->isSGPRClass(RC) && isDivergent)
+ if (TRI->isSGPRClass(RC) && isDivergent) {
+ if (Subtarget->hasGFX90AInsts())
+ return TRI->getEquivalentAVClass(RC);
return TRI->getEquivalentVGPRClass(RC);
+ }
return RC;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 3f52e8229ac08..bbae1c976ae1d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3638,6 +3638,14 @@ SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
return ARC;
}
+const TargetRegisterClass *
+SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const {
+ unsigned Size = getRegSizeInBits(*SRC);
+ const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size);
+ assert(ARC && "Invalid register class size");
+ return ARC;
+}
+
const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
unsigned Size = getRegSizeInBits(*VRC);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 6e119e5e7c194..e2fe991340494 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -289,6 +289,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
const TargetRegisterClass *
getEquivalentAGPRClass(const TargetRegisterClass *SRC) const;
+ /// \returns An AGPR+VGPR super reg class with the same width as \p SRC
+ const TargetRegisterClass *
+ getEquivalentAVClass(const TargetRegisterClass *SRC) const;
+
/// \returns A SGPR reg class with the same width as \p SRC
const TargetRegisterClass *
getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index d89b39348ad9a..5c526c78afcd7 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -43,31 +43,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: successors: %bb.3(0x80000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
+ ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF implicit-def $vgpr16
; GFX90A-NEXT: renamable $vgpr3 = IMPLICIT_DEF implicit-def $vgpr2
- ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24
; GFX90A-NEXT: renamable $vgpr27 = IMPLICIT_DEF implicit-def $vgpr26
; GFX90A-NEXT: renamable $vgpr29 = IMPLICIT_DEF implicit-def $vgpr28
+ ; GFX90A-NEXT: renamable $vgpr33 = IMPLICIT_DEF implicit-def $vgpr32
; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.3.Flow17:
; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr6 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc
; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.4.bb15:
; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec
; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr1, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr6, implicit $exec
; GFX90A-NEXT: renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -75,7 +75,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.5:
; GFX90A-NEXT: successors: %bb.6(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0
@@ -88,9 +88,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF
@@ -98,32 +98,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.6.Flow20:
; GFX90A-NEXT: successors: %bb.7(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr26 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr28 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr29 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr18, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr33 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr18, implicit $exec
; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr29 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.7.Flow19:
; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0
; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -131,7 +131,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.8.Flow32:
; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -140,15 +140,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.9.bb89:
; GFX90A-NEXT: successors: %bb.10(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
- ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.10.Flow33:
; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -157,15 +157,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.11.bb84:
; GFX90A-NEXT: successors: %bb.12(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
- ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.12.Flow34:
; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -174,10 +174,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.13.bb79:
; GFX90A-NEXT: successors: %bb.14(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
- ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_OR_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.14.Flow35:
@@ -359,7 +359,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.35.bb20:
; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i23)
; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
@@ -376,37 +376,37 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.36.Flow21:
; GFX90A-NEXT: successors: %bb.6(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
; GFX90A-NEXT: S_BRANCH %bb.6
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.37.bb27:
; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr48_sgpr49
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr48_sgpr49
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i30)
; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
@@ -416,29 +416,29 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.38.Flow22:
; GFX90A-NEXT: successors: %bb.36(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -459,7 +459,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.39.bb34:
; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i37)
; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
@@ -469,28 +469,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.40.Flow23:
; GFX90A-NEXT: successors: %bb.38(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -510,40 +510,39 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.41.bb41:
; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
- ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc
- ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i44)
+ ; GFX90A-NEXT: renamable $vgpr1, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, $vcc, 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $vgpr58, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i44)
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
- ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr59, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.42.Flow24:
; GFX90A-NEXT: successors: %bb.40(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
- ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr3, implicit $exec
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
@@ -560,11 +559,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.43.bb55:
; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc
- ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_CSELECT_B64 -1, 0, implicit killed $scc
- ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr66_sgpr67, -1, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
+ ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc
@@ -572,27 +571,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.44:
; GFX90A-NEXT: successors: %bb.45(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr62, $vgpr56, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr57, $vgpr61, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr4, $vgpr5, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr63, $vgpr58
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr62, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr40, $vgpr61, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr4, $vgpr5, $vgpr6, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr58, $vgpr60, $vgpr63, $vgpr59
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.45.Flow26:
; GFX90A-NEXT: successors: %bb.47(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_XOR_B64 $exec, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
@@ -608,7 +607,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.46.bb48:
; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc
@@ -620,26 +619,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr44_sgpr45 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.47.Flow25:
; GFX90A-NEXT: successors: %bb.42(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr44_sgpr45, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -657,139 +656,139 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.48.bb63:
; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000)
- ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
+ ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0
; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.49:
; GFX90A-NEXT: successors: %bb.44(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1
; GFX90A-NEXT: S_BRANCH %bb.44
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.50.bb68:
; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec
; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc
; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.51:
; GFX90A-NEXT: successors: %bb.45(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: S_BRANCH %bb.45
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.52.bb80:
; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
- ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.53:
; GFX90A-NEXT: successors: %bb.61(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1
- ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
- ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: S_BRANCH %bb.61
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.54.bb73:
; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr8 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76)
- ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76)
+ ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
- ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
- ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr8, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr62_sgpr63 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.55.Flow29:
; GFX90A-NEXT: successors: %bb.45(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr62_sgpr63, implicit-def $scc
; GFX90A-NEXT: S_BRANCH %bb.45
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.56.bb90:
; GFX90A-NEXT: successors: %bb.60(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr12 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr66_sgpr67, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr13 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $sgpr21, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr14, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr13, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $sgpr22, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr13, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr7 = COPY renamable $sgpr21, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr7, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr3, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr3 = COPY renamable $sgpr22, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr3, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_LSHR_B64 killed renamable $sgpr56_sgpr57, 1, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr32_vgpr33 = V_LSHRREV_B64_e64 1, $vgpr20_vgpr21, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = V_LSHRREV_B64_e64 1, $vgpr24_vgpr25, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr7 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr20, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr22, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.60
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.57:
; GFX90A-NEXT: successors: %bb.7(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr30 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr22 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr14 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr18 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr16 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0
@@ -800,9 +799,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF
@@ -812,7 +811,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr17 = COPY renamable $vgpr5, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr7 = COPY renamable $vgpr5, implicit $exec
; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0
; GFX90A-NEXT: S_BRANCH %bb.7
; GFX90A-NEXT: {{ $}}
@@ -821,62 +820,62 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr28_vgpr29 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3)
; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
; GFX90A-NEXT: renamable $vgpr2_vgpr3 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3)
; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr28_vgpr29 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr32_vgpr33 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1
; GFX90A-NEXT: S_BRANCH %bb.3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.59.bb85:
; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr12 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
+ ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 1, $vgpr10, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $vgpr11, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr12_vgpr13, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1
- ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr12, implicit $exec
- ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
- ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37
+ ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr24 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.60.Flow31:
; GFX90A-NEXT: successors: %bb.61(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.61.Flow30:
; GFX90A-NEXT: successors: %bb.55(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr52_sgpr53, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
; GFX90A-NEXT: S_BRANCH %bb.55
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.62.bb140:
; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr24_sgpr25 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
@@ -884,122 +883,122 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.63.Flow13:
; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc
; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.64.bb159:
; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
+ ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr6, implicit $exec
; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.67, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.65.Flow10:
; GFX90A-NEXT: successors: %bb.66(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.66.Flow14:
; GFX90A-NEXT: successors: %bb.8(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = COPY $exec
; GFX90A-NEXT: S_BRANCH %bb.8
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.67.bb161:
; GFX90A-NEXT: successors: %bb.65(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr25, killed $vgpr27, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr29, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr15, killed $vgpr3, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr27, killed $vgpr29, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr33, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr17, killed $vgpr3, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr12, 0, $vgpr3, 0, 0, 6, implicit $exec
+ ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr30, 0, $vgpr3, 0, 0, 6, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr16, killed $vgpr19, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr18, killed $vgpr21, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr4, killed $vgpr2, implicit $exec
- ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
+ ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr7, 0, $vgpr3, 0, 0, 6, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr32, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr14, implicit $exec
; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
; GFX90A-NEXT: S_BRANCH %bb.65
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.68.bb174:
; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
- ; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $agpr0 = COPY killed renamable $vgpr32, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 1, $vgpr28, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr52 = V_OR_B32_e32 $vgpr32, $vgpr26, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr52, $vgpr24, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr34 = V_CNDMASK_B32_e64 0, $vgpr38, 0, 0, $sgpr12_sgpr13, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr34, $vgpr2, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr50, $vgpr14, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr48, $vgpr18, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr36, killed $sgpr12_sgpr13, implicit $exec
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000F, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: renamable $agpr0 = COPY killed renamable $vgpr14, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 1, $vgpr32, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 $vgpr34, $vgpr28, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr54, $vgpr26, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr36 = V_CNDMASK_B32_e64 0, $vgpr48, 0, 0, $sgpr12_sgpr13, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr52 = V_OR_B32_e32 $vgpr36, $vgpr2, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr52, $vgpr16, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr50, $vgpr20, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr14 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr38, killed $sgpr12_sgpr13, implicit $exec
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.69.Flow:
; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.70.bb186:
; GFX90A-NEXT: successors: %bb.71(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr4_vgpr5 = V_LSHLREV_B64_e64 3, killed $vgpr4_vgpr5, implicit $exec
; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr27, implicit $exec
; GFX90A-NEXT: renamable $vgpr4, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr4, 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr2, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr5, killed $vcc, 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr33 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr33, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr33, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr33, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr33, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr33, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr55 = COPY renamable $vgpr33, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr33, implicit $exec
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr35 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr55 = COPY renamable $vgpr35, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr35, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr35, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr35, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr35, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr15 = COPY renamable $vgpr35, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr35, implicit $exec
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr35, renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr13 = COPY killed renamable $sgpr22, implicit $exec
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr13, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr33, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr16 = COPY killed renamable $sgpr22, implicit $exec
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr16, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr35, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr35, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr14_vgpr15, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr35, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.71.Flow9:
; GFX90A-NEXT: successors: %bb.63(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr24_sgpr25 = S_MOV_B64 0
- ; GFX90A-NEXT: renamable $vgpr32 = COPY killed renamable $agpr0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr14 = COPY killed renamable $agpr0, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.63
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.72.bb196:
; GFX90A-NEXT: successors: %bb.69(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x000000000000000C, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x000000000000000C, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr54, killed $vgpr22, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr20 = V_OR_B32_e32 killed $vgpr2, killed $vgpr20, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr21, renamable $vgpr20_vgpr21, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+ ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr14, killed $vgpr24, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr22 = V_OR_B32_e32 killed $vgpr2, killed $vgpr22, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr23, renamable $vgpr22_vgpr23, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0
; GFX90A-NEXT: S_BRANCH %bb.69
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
index 2abd7edade8a1..99f1cce0c9055 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll
@@ -16,7 +16,7 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offset_rtn(<2 x half> %val
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
- ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
@@ -36,7 +36,7 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_offen_rtn(<2 x half> %val,
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -56,7 +56,7 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_idxen_rtn(<2 x half> %val,
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
@@ -78,7 +78,7 @@ define amdgpu_ps <2 x half> @buffer_atomic_fadd_v2f16_bothen_rtn(<2 x half> %val
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
@@ -103,7 +103,7 @@ define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offset_rtn(<2 x half>
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
- ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
@@ -129,7 +129,7 @@ define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_offen_rtn(<2 x half> %
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
- ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -155,7 +155,7 @@ define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_idxen_rtn(<2 x half> %
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
- ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
@@ -183,7 +183,7 @@ define amdgpu_ps <2 x half> @buffer_ptr_atomic_fadd_v2f16_bothen_rtn(<2 x half>
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index 04f8ad8a02303..931a62298812f 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -258,68 +258,59 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
-; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v0, s[4:7], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], 0 offen offset:32
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v0, s[4:7], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v0, s[4:7], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v0, s[4:7], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v0, s[4:7], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v0, s[4:7], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v0, s[4:7], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v0, s[4:7], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v0, s[4:7], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v0, s[4:7], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v0, s[4:7], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240
+; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16
; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
-; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
-; SDAG-GFX942-NEXT: s_nop 1
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
@@ -799,68 +790,59 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
-; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v0, s[4:7], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], 0 offen offset:32
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v0, s[4:7], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v0, s[4:7], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v0, s[4:7], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v0, s[4:7], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v0, s[4:7], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v0, s[4:7], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v0, s[4:7], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v0, s[4:7], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v0, s[4:7], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v0, s[4:7], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240
+; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16
; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x100
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
-; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
-; SDAG-GFX942-NEXT: s_nop 1
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
@@ -1158,8 +1140,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
; SDAG-GFX942-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen
; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
; SDAG-GFX942-NEXT: s_mov_b32 s5, s12
@@ -1170,12 +1152,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v5, s0
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v1, s0
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16
; SDAG-GFX942-NEXT: s_endpgm
;
; SDAG-GFX1100-LABEL: memcpy_known_small:
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index fccee3da6d77e..5f965ba431ab5 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -1553,13 +1553,13 @@ define void @too_many_args_use_workitem_id_xyz(
; GFX90A-LABEL: too_many_args_use_workitem_id_xyz:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX90A-NEXT: v_and_b32_e32 v33, 0x3ff, v31
-; GFX90A-NEXT: global_store_dword v[0:1], v33, off
+; GFX90A-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX90A-NEXT: v_and_b32_e32 v32, 0x3ff, v31
+; GFX90A-NEXT: global_store_dword v[0:1], v32, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_bfe_u32 v33, v31, 10, 10
+; GFX90A-NEXT: v_bfe_u32 v32, v31, 10, 10
; GFX90A-NEXT: v_bfe_u32 v31, v31, 20, 10
-; GFX90A-NEXT: global_store_dword v[0:1], v33, off
+; GFX90A-NEXT: global_store_dword v[0:1], v32, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dword v[0:1], v31, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1625,7 +1625,7 @@ define void @too_many_args_use_workitem_id_xyz(
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dword v[0:1], v30, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dword v[0:1], v32, off
+; GFX90A-NEXT: global_store_dword v[0:1], v33, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index 0e86a1ac68119..56de9dde7c310 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -120,19 +120,19 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_mov_b32_e32 v3, v0
-; CHECK-NEXT: v_bfi_b32 v2, v3, -2, -1
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: v_bfi_b32 v0, v1, -2, -1
+; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT: s_cbranch_execnz .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 5c4e25c3120e9..2079c864c29d3 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -8485,27 +8485,27 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX942-NEXT: flat_load_dword v4, v[0:1]
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
; GFX942-NEXT: v_and_b32_e32 v3, 3, v3
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_add_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB36_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8630,26 +8630,26 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8858,30 +8858,30 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: flat_load_dword v4, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_add_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB37_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9010,26 +9010,26 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9243,30 +9243,30 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: flat_load_dword v4, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_add_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB38_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9395,26 +9395,26 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10978,13 +10978,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_add_f16_e32 v3, v5, v2
; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX942-NEXT: buffer_wbl2 sc1
@@ -10993,6 +10992,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB43_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11087,13 +11087,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
@@ -11101,6 +11100,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11289,30 +11289,30 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: flat_load_dword v4, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_add_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB44_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11441,28 +11441,28 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12080,36 +12080,36 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB46_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -12262,33 +12262,33 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -12533,41 +12533,41 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB47_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12717,41 +12717,41 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13001,41 +13001,41 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB48_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13185,41 +13185,41 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14346,34 +14346,34 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX942-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB51_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -14495,32 +14495,32 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -15546,41 +15546,41 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB54_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15730,43 +15730,43 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16429,18 +16429,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16618,18 +16618,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16819,18 +16819,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17584,12 +17584,11 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
@@ -17598,6 +17597,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB62_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17966,18 +17966,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB64_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB64_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18334,18 +18334,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB66_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB66_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18796,37 +18796,37 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB68_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -19120,37 +19120,37 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -19453,36 +19453,36 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB70_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20759,39 +20759,39 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB74_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -21409,37 +21409,37 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB76_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
@@ -22045,37 +22045,37 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB78_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 53e9468c5d5b6..d07b2b412acef 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -6125,29 +6125,29 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
@@ -6279,28 +6279,28 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
@@ -6519,34 +6519,34 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB27_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6676,36 +6676,36 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6929,34 +6929,34 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -7086,36 +7086,36 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8481,26 +8481,26 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v4, v2, v2
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v2
-; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB32_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -8599,25 +8599,25 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -9110,34 +9110,34 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB34_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9267,38 +9267,38 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX90A-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9940,36 +9940,36 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB36_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -10122,33 +10122,33 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -10394,41 +10394,41 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB37_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10578,41 +10578,41 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10863,41 +10863,41 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB38_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11047,41 +11047,41 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -12649,34 +12649,34 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB42_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12798,32 +12798,32 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -13414,41 +13414,41 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB44_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13598,43 +13598,43 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14261,25 +14261,25 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1]
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB46_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -14341,23 +14341,23 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -14497,25 +14497,25 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB47_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14578,23 +14578,23 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14734,21 +14734,18 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX942-NEXT: flat_load_dword v0, v[0:1]
+; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX942-NEXT: flat_load_dword v3, v[4:5]
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_pk_max_f16 v1, v2, v2
; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_pk_max_f16 v0, v3, v3
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_pk_max_f16 v2, v0, v1
@@ -14758,6 +14755,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB48_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14829,20 +14827,20 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3
-; GFX90A-NEXT: v_pk_max_f16 v2, v0, v1
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15707,25 +15705,25 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB52_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15788,25 +15786,25 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16263,39 +16261,39 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB54_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16441,37 +16439,37 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16710,39 +16708,39 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB55_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16889,37 +16887,37 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -17156,46 +17154,44 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX942-NEXT: flat_load_dword v0, v[0:1]
+; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX942-NEXT: flat_load_dword v3, v[4:5]
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX942-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v0, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v2, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB56_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17350,36 +17346,36 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18962,39 +18958,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB60_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -19141,39 +19137,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 5ee3ff67aa8a0..4eb8fdb0b0999 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -6125,29 +6125,29 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_min_f16_e32 v4, v4, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
@@ -6279,28 +6279,28 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
@@ -6519,34 +6519,34 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB27_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6676,36 +6676,36 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6929,34 +6929,34 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -7086,36 +7086,36 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8481,26 +8481,26 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v4, v2, v2
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v3, v3, v2
-; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB32_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -8599,25 +8599,25 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -9110,34 +9110,34 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB34_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9267,38 +9267,38 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX90A-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9940,36 +9940,36 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB36_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -10122,33 +10122,33 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -10394,41 +10394,41 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB37_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10578,41 +10578,41 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10863,41 +10863,41 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB38_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11047,41 +11047,41 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -12649,34 +12649,34 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB42_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12798,32 +12798,32 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -13414,41 +13414,41 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB44_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13598,43 +13598,43 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX90A-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14261,25 +14261,25 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1]
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB46_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -14341,23 +14341,23 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -14497,25 +14497,25 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB47_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14578,23 +14578,23 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14734,21 +14734,18 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX942-NEXT: flat_load_dword v0, v[0:1]
+; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX942-NEXT: flat_load_dword v3, v[4:5]
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_pk_max_f16 v1, v2, v2
; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_pk_max_f16 v0, v3, v3
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_pk_min_f16 v2, v0, v1
@@ -14758,6 +14755,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB48_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14829,20 +14827,20 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3
-; GFX90A-NEXT: v_pk_min_f16 v2, v0, v1
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15707,25 +15705,25 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB52_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15788,25 +15786,25 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16263,39 +16261,39 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB54_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16441,37 +16439,37 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16710,39 +16708,39 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB55_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16889,37 +16887,37 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -17156,46 +17154,44 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX942-NEXT: flat_load_dword v0, v[0:1]
+; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX942-NEXT: flat_load_dword v3, v[4:5]
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX942-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v0, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v2, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB56_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17350,36 +17346,36 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18962,39 +18958,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB60_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -19141,39 +19137,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 0e563c26d27ea..7813cb4226dad 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -5918,27 +5918,27 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX942-NEXT: flat_load_dword v4, v[0:1]
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
; GFX942-NEXT: v_and_b32_e32 v3, 3, v3
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB22_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6063,26 +6063,26 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6291,30 +6291,30 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: flat_load_dword v4, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB23_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6443,26 +6443,26 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6676,30 +6676,30 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: flat_load_dword v4, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB24_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6828,26 +6828,26 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8137,13 +8137,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2
; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX942-NEXT: buffer_wbl2 sc1
@@ -8152,6 +8151,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8246,13 +8246,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_sub_f16_e32 v3, v5, v2
; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
@@ -8260,6 +8259,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8722,30 +8722,30 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: flat_load_dword v4, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB30_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8874,28 +8874,28 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9513,36 +9513,36 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB32_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16:
@@ -9695,33 +9695,33 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16:
@@ -9966,41 +9966,41 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB33_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -10150,41 +10150,41 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -10434,41 +10434,41 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB34_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -10618,41 +10618,41 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -12216,34 +12216,34 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB38_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -12365,32 +12365,32 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -12979,41 +12979,41 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: flat_load_dword v5, v[0:1]
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB40_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -13163,43 +13163,43 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX90A-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB40_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -13820,12 +13820,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: flat_load_dword v3, v[0:1]
+; GFX942-NEXT: flat_load_dword v5, v[0:1]
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
@@ -13833,6 +13832,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB42_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13893,18 +13893,18 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14039,12 +14039,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
@@ -14052,6 +14051,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB43_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14113,18 +14113,18 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX90A-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14261,27 +14261,25 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX942-NEXT: flat_load_dword v0, v[0:1]
+; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX942-NEXT: flat_load_dword v7, v[4:5]
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX942-NEXT: v_pk_add_f16 v6, v7, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB44_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14349,18 +14347,18 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15158,12 +15156,11 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX942-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
@@ -15171,6 +15168,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB48_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15232,12 +15230,11 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX90A-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
@@ -15246,6 +15243,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15682,39 +15680,39 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB50_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
@@ -15860,37 +15858,37 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
@@ -16129,39 +16127,39 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB51_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -16308,37 +16306,37 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -16575,46 +16573,44 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
-; GFX942-NEXT: flat_load_dword v0, v[0:1]
+; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX942-NEXT: flat_load_dword v3, v[4:5]
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; GFX942-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v0, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v2, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v2, v0, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
+; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB52_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16769,36 +16765,36 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18381,39 +18377,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB56_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -18560,39 +18556,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 8a4b2c428e31a..1a1437a25a1fe 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -3626,7 +3626,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB36_2
; GFX950-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private
@@ -3794,7 +3794,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB37_2
; GFX950-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private
@@ -4512,7 +4512,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB44_2
; GFX950-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private
@@ -4680,7 +4680,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB45_2
; GFX950-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private
@@ -5398,7 +5398,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB52_2
; GFX950-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private
@@ -5566,7 +5566,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB53_2
; GFX950-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private
@@ -13018,21 +13018,20 @@ define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
-; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0
-; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-SDAG-NEXT: s_nop 0
-; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v1
-; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB124_1
; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13184,21 +13183,20 @@ define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
-; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0
-; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-SDAG-NEXT: s_nop 0
-; GFX950-SDAG-NEXT: v_pk_min_f16 v4, v0, v1
-; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB126_1
; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13396,24 +13394,23 @@ define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: v_mov_b32_e32 v1, v0
-; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v7, v0
-; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
-; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX950-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0
-; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB130_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13513,24 +13510,23 @@ define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: v_mov_b32_e32 v1, v0
-; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: .LBB132_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v7, v0
-; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
-; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX950-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0
-; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0
+; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB132_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
index 3d0ebc72791bd..39fa342d2a66f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -36,37 +36,36 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %
; GFX90A-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY6]], 0, 0, implicit $exec :: (load (s64) from %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:av_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY6]], 0, 0, implicit $exec :: (load (s64) from %ir.ptr, addrspace 1)
; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]]
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1.atomicrmw.start:
; GFX90A-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1
- ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY7]], %bb.0, %3, %bb.1
+ ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.0, %3, %bb.1
; GFX90A-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI1]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1
- ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0
- ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
- ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
- ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3
- ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]]
- ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY12]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1)
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:av_32 = COPY [[V_ADD_F64_e64_]].sub1
+ ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_ADD_F64_e64_]].sub0
+ ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:av_32 = COPY [[PHI1]].sub1
+ ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:av_32 = COPY [[PHI1]].sub0
+ ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
+ ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]]
+ ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY11]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1)
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U64_e64 [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], [[PHI1]], implicit $exec
- ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]]
+ ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]]
; GFX90A-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U64_e64_]], [[PHI]], implicit-def dead $scc
; GFX90A-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.2
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.2.atomicrmw.end:
- ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY13]], %bb.1
+ ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY12]], %bb.1
; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1
; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
- ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
- ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
- ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+ ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+ ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 1978e68fdae9c..0a40e62b4ed3a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -8960,27 +8960,27 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX942-NEXT: global_load_dword v4, v[0:1], off
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
; GFX942-NEXT: v_and_b32_e32 v3, 3, v3
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_add_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB44_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9105,26 +9105,26 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9469,30 +9469,30 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_load_dword v4, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_add_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB45_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9621,26 +9621,26 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB45_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9993,30 +9993,30 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_load_dword v4, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_add_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB46_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -10145,26 +10145,26 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11972,13 +11972,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_add_f16_e32 v3, v5, v2
; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX942-NEXT: buffer_wbl2 sc1
@@ -11987,6 +11986,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB50_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12080,13 +12080,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
@@ -12094,6 +12093,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12780,30 +12780,30 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_load_dword v4, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_add_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB52_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12932,28 +12932,28 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13852,36 +13852,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB54_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -14034,33 +14034,33 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -14450,41 +14450,41 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB55_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14634,41 +14634,41 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15068,41 +15068,41 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB56_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -15252,41 +15252,41 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -17409,34 +17409,34 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX942-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB60_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -17557,32 +17557,32 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -18417,41 +18417,41 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB62_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -18601,43 +18601,43 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB62_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -21397,18 +21397,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p
; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB72_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -22365,18 +22365,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac
; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB76_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -22963,37 +22963,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB78_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -23351,37 +23351,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB79_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -23741,37 +23741,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB80_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -25278,39 +25278,39 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB84_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -26052,37 +26052,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB86_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
@@ -26816,37 +26816,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB88_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -27580,37 +27580,37 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB90_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index dc995fb7ef79c..dec3dd79f60d1 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4548,29 +4548,29 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4702,28 +4702,28 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4992,34 +4992,34 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB27_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5149,36 +5149,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5454,34 +5454,34 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5611,36 +5611,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -7207,26 +7207,26 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v4, v2, v2
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v3, v3, v2
-; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB32_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7324,25 +7324,25 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7912,34 +7912,34 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB34_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8069,38 +8069,38 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8844,36 +8844,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB36_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -9026,33 +9026,33 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -9349,41 +9349,41 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB37_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9533,41 +9533,41 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9871,41 +9871,41 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB38_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10055,41 +10055,41 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11862,34 +11862,34 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB42_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12010,32 +12010,32 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12705,41 +12705,41 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB44_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12889,43 +12889,43 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13656,25 +13656,25 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB46_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -13736,23 +13736,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -13949,25 +13949,25 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB47_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14029,23 +14029,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14244,25 +14244,25 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB48_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14324,23 +14324,23 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -15404,25 +15404,25 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_max_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB52_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15484,25 +15484,25 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16067,39 +16067,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB54_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16245,37 +16245,37 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16567,39 +16567,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB55_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16745,37 +16745,37 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -17069,39 +17069,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB56_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -17247,37 +17247,37 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -19040,39 +19040,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB60_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -19218,39 +19218,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index f62e13a9d4341..ce087737b852a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4548,29 +4548,29 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX942-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX942-NEXT: v_min_f16_e32 v4, v4, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB26_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4702,28 +4702,28 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4992,34 +4992,34 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB27_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5149,36 +5149,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5454,34 +5454,34 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5611,36 +5611,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -7207,26 +7207,26 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v4, v2, v2
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v3, v3, v2
-; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX942-NEXT: v_and_or_b32 v2, v3, s2, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB32_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7324,25 +7324,25 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7912,34 +7912,34 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_max_f16_e32 v6, v2, v2
; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX942-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX942-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX942-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX942-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB34_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8069,38 +8069,38 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7
-; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8844,36 +8844,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB36_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -9026,33 +9026,33 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -9349,41 +9349,41 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB37_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9533,41 +9533,41 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9871,41 +9871,41 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB38_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10055,41 +10055,41 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11862,34 +11862,34 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB42_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12010,32 +12010,32 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12705,41 +12705,41 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB44_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12889,43 +12889,43 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13656,25 +13656,25 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB46_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -13736,23 +13736,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -13949,25 +13949,25 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB47_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14029,23 +14029,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14244,25 +14244,25 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB48_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14324,23 +14324,23 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -15404,25 +15404,25 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX942-NEXT: v_pk_max_f16 v4, v2, v2
; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v2, v3, v3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX942-NEXT: v_pk_min_f16 v2, v2, v4
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB52_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15484,25 +15484,25 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16067,39 +16067,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB54_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16245,37 +16245,37 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16567,39 +16567,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB55_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16745,37 +16745,37 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -17069,39 +17069,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB56_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -17247,37 +17247,37 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -19040,39 +19040,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB60_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -19218,39 +19218,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 9e6f0fd7f13b5..f72296b68bea2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -5290,27 +5290,27 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX942-NEXT: global_load_dword v4, v[0:1], off
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
; GFX942-NEXT: v_and_b32_e32 v3, 3, v3
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB22_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5435,26 +5435,26 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5713,30 +5713,30 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_load_dword v4, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB23_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5865,26 +5865,26 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6150,30 +6150,30 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_movk_i32 s0, 0xf800
; GFX942-NEXT: s_mov_b32 s1, -1
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_load_dword v4, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB24_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6302,26 +6302,26 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7812,13 +7812,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: s_mov_b32 s2, 0xffff0000
; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_sub_f16_e32 v3, v5, v2
; GFX942-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX942-NEXT: buffer_wbl2 sc1
@@ -7827,6 +7826,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7920,13 +7920,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX90A-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_sub_f16_e32 v3, v5, v2
; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
@@ -7934,6 +7933,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8473,30 +8473,30 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX942-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX942-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_load_dword v4, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v5, v5
+; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX942-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX942-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX942-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB30_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8625,28 +8625,28 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_sub_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9366,36 +9366,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX942-NEXT: s_mov_b32 s0, 0xffff
; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_not_b32_e32 v6, v4
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX942-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX942-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB32_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16:
@@ -9548,33 +9548,33 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fsub_ret_bf16:
@@ -9869,41 +9869,41 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB33_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -10053,41 +10053,41 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX90A-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -10389,41 +10389,41 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB34_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -10573,41 +10573,41 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX90A-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -12372,34 +12372,34 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: s_mov_b32 s3, 0xffff0000
; GFX942-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB38_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -12520,32 +12520,32 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -13211,41 +13211,41 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX942-NEXT: v_and_b32_e32 v0, -4, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_load_dword v5, v[0:1], off
-; GFX942-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX942-NEXT: s_mov_b32 s0, 0xffff
-; GFX942-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX942-NEXT: v_not_b32_e32 v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX942-NEXT: v_not_b32_e32 v5, v5
; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX942-NEXT: s_movk_i32 s2, 0x7fff
; GFX942-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX942-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX942-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB40_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -13395,43 +13395,43 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX90A-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB40_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -14154,12 +14154,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: global_load_dword v5, v[0:1], off
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
@@ -14167,6 +14166,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB42_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14227,18 +14227,18 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX90A-LABEL: global_agent_atomic_fsub_ret_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14430,12 +14430,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa
; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
@@ -14443,6 +14442,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB43_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14503,18 +14503,18 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa
; GFX90A-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14708,12 +14708,11 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa
; GFX942-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
@@ -14721,6 +14720,7 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB44_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14781,18 +14781,18 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa
; GFX90A-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15794,12 +15794,11 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp
; GFX942-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX942-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
@@ -15807,6 +15806,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB48_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -15867,12 +15867,11 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp
; GFX90A-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
@@ -15881,6 +15880,7 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16425,39 +16425,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB50_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16:
@@ -16603,37 +16603,37 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fsub_ret_v2bf16:
@@ -16925,39 +16925,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB51_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -17103,37 +17103,37 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -17427,39 +17427,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB52_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
@@ -17605,37 +17605,37 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
@@ -19398,39 +19398,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX942-NEXT: s_mov_b64 s[2:3], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX942-NEXT: s_movk_i32 s4, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX942-NEXT: s_mov_b32 s5, 0x7060302
; GFX942-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX942-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX942-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX942-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX942-NEXT: v_add3_u32 v6, v6, v3, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX942-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX942-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX942-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX942-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX942-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX942-NEXT: v_add3_u32 v7, v7, v2, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v6, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v5, v3, s5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v2, v6, v2, s5
; GFX942-NEXT: buffer_wbl2 sc0 sc1
-; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1
+; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc0 sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB56_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -19576,39 +19576,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index 57bfd2490f9da..d973f7b71fb6d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -19,11 +19,11 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-LABEL: half8:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX90A-NEXT: global_load_dwordx4 v[2:5], v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half8:
@@ -85,11 +85,11 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-LABEL: half6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GFX90A-NEXT: global_load_dwordx3 v[2:4], v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX90A-NEXT: global_store_dwordx3 v0, v[2:4], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half6:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 24512f2f7905a..e1a9935b790f3 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -162,12 +162,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX90A-LABEL: atomic_nand_i32_global:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: v_bfi_b32 v2, v3, -5, -1
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
@@ -176,6 +175,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index d23509b5aa812..f99718de97765 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -49,10 +49,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_barrier
-; GFX90A-NEXT: ds_read_b32 v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: ds_read_b32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: barrier_release:
@@ -72,10 +72,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_barrier
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-LABEL: barrier_release:
@@ -94,10 +94,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_barrier
-; GFX942-NEXT: ds_read_b32 v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: ds_read_b32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: barrier_release:
@@ -117,10 +117,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: s_barrier
; GFX942-TGSPLIT-NEXT: buffer_inv sc0
-; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0
-; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX10WGP-LABEL: barrier_release:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index 3e96dfe40f745..a57b43a81205b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -37,11 +37,11 @@ entry:
define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b96_tr_b6:
; GFX950-SDAG: ; %bb.0: ; %entry
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1
-; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[4:6], v0 offset:32
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
+; GFX950-SDAG-NEXT: global_store_dwordx3 v[2:3], v[4:6], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b96_tr_b6:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index e174fc17e98fe..1e6aea593065c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -159,100 +159,100 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v3, s0, v0
-; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112
-; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96
-; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80
-; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64
-; GCN-NEXT: ds_read_b128 a[0:3], v3
-; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16
-; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32
-; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[128:131], v3
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
-; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304
-; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288
-; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272
-; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256
-; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240
-; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224
-; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208
-; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:8192
; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
-; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
-; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:24576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
-; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:49152
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
-; GCN-NEXT: ds_write_b128 v0, a[0:3]
+; GCN-NEXT: ds_read_b128 a[28:31], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
+; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
+; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
+; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64
+; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48
+; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32
+; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16
+; GCN-NEXT: ds_write_b128 v0, a[128:131]
; GCN-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288
-; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304
-; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256
-; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272
-; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224
-; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240
-; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192
-; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208
-; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480
-; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496
-; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448
-; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464
-; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416
-; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432
-; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384
-; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400
-; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672
-; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688
-; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640
-; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656
-; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608
-; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624
-; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576
-; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592
-; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864
-; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880
-; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832
-; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848
-; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800
-; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816
-; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768
-; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784
+; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288
+; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304
+; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256
+; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272
+; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224
+; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240
+; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192
+; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208
+; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
+; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
+; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
+; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464
+; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416
+; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
+; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
+; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
+; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672
+; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688
+; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640
+; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656
+; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608
+; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624
+; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576
+; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; GCN-NEXT: s_endpgm
entry:
call void @llvm.amdgcn.iglp.opt(i32 1)
@@ -294,17 +294,17 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias
; GCN-NEXT: ; iglp_opt mask(0x00000000)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b32 v1, v1
+; GCN-NEXT: ds_read_b32 v2, v1
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b32 v0, v1
+; GCN-NEXT: ds_write_b32 v0, v2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: ds_read_b32 v0, v2 offset:256
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: ds_read_b32 v1, v1 offset:256
+; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b32 v1, v0 offset:256
+; GCN-NEXT: ds_write_b32 v0, v1 offset:256
; GCN-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
index bc72687e260e7..ff3de9e05e897 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
@@ -26,7 +26,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0
; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1
; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112
@@ -48,7 +47,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
@@ -71,7 +69,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
@@ -94,7 +91,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
@@ -117,7 +113,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
@@ -130,6 +125,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768
; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: s_endpgm
;
; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
@@ -495,8 +495,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 1
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288
@@ -520,7 +520,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 1
@@ -543,7 +542,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
@@ -566,7 +564,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
@@ -579,6 +576,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:32784
; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:32768
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: s_endpgm
;
; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave_split_region:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index b65a1a8e06c7d..19bbfe1cdd9bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -636,48 +636,48 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32
; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48
; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
-; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304
-; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288
-; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272
-; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256
-; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240
-; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224
-; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208
-; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192
-; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
-; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
-; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:8192
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:24576
+; GCN-NEXT: ds_read_b128 a[60:63], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[56:59], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[52:55], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[48:51], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[44:47], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[40:43], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[36:39], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[32:35], v3 offset:49152
+; GCN-NEXT: ds_read_b128 a[28:31], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392
; GCN-NEXT: s_waitcnt lgkmcnt(14)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(8)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: s_waitcnt lgkmcnt(8)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-NEXT: s_nop 11
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
@@ -688,38 +688,38 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16
; GCN-NEXT: ds_write_b128 v0, a[128:131]
; GCN-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208
-; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480
-; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496
-; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448
-; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464
-; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416
-; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432
-; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384
-; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400
-; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672
-; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688
-; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640
-; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656
-; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608
-; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624
-; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576
-; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592
-; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864
-; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880
-; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832
-; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848
-; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800
-; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816
-; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768
-; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784
+; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288
+; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304
+; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256
+; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272
+; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224
+; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240
+; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192
+; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208
+; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
+; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
+; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
+; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464
+; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416
+; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
+; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
+; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
+; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672
+; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688
+; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640
+; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656
+; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608
+; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624
+; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576
+; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0)
; GCN-NEXT: s_endpgm
@@ -742,48 +742,48 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32
; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48
; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192
-; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688
-; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672
-; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656
-; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640
-; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624
-; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608
-; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576
-; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152
-; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440
-; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:8192
+; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:24688
+; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:24672
+; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:24656
+; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:24640
+; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:24624
+; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:24608
+; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:24576
+; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v3 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v3 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v3 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v3 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v3 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v3 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v3 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v3 offset:49152
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v4 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v4 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v4 offset:57424
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v4 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v4 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v4 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v4 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v4 offset:57392
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; EXACTCUTOFF-NEXT: s_nop 11
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
@@ -794,38 +794,38 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:16
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131]
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:16480
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:16496
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:16448
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:16464
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:16416
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:16432
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:16384
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16400
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:24672
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:24688
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:24640
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:24656
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:24608
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:24624
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:24576
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:24592
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:32864
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:32880
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:32832
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:32848
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:32800
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:32816
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:32768
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:32784
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:8288
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:8304
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:8256
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:8272
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:8224
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:8240
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:8192
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:8208
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:24672
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:24688
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:24640
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:24656
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:24608
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(40) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
@@ -1202,57 +1202,57 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_add_u32_e32 v1, s6, v0
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[96:99], v1
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:48
-; GCN-NEXT: v_mov_b32_e32 v9, 1.0
+; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
+; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
+; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
+; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
+; GCN-NEXT: ds_read_b128 a[128:131], v1
+; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
+; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
+; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
+; GCN-NEXT: v_mul_f32_e32 v9, s1, v3
+; GCN-NEXT: v_mov_b32_e32 v12, 1.0
; GCN-NEXT: v_ldexp_f32 v4, v4, v5
; GCN-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; GCN-NEXT: v_mul_f32_e32 v10, s1, v3
+; GCN-NEXT: v_rndne_f32_e32 v10, v9
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5
; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218
-; GCN-NEXT: v_rndne_f32_e32 v11, v10
+; GCN-NEXT: v_sub_f32_e32 v11, v9, v10
+; GCN-NEXT: v_fma_f32 v9, s1, v3, -v9
; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6
; GCN-NEXT: v_mov_b32_e32 v8, 0x7f800000
-; GCN-NEXT: v_sub_f32_e32 v12, v10, v11
-; GCN-NEXT: v_fma_f32 v10, s1, v3, -v10
+; GCN-NEXT: v_fmac_f32_e32 v9, s1, v7
; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; GCN-NEXT: v_fmac_f32_e32 v10, s1, v7
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
+; GCN-NEXT: v_add_f32_e32 v9, v11, v9
+; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304
; GCN-NEXT: s_waitcnt lgkmcnt(1)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127]
-; GCN-NEXT: v_add_f32_e32 v4, v12, v10
-; GCN-NEXT: v_exp_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; GCN-NEXT: v_ldexp_f32 v4, v4, v10
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v12, v4, a[128:159]
+; GCN-NEXT: v_exp_f32_e32 v4, v9
+; GCN-NEXT: v_cvt_i32_f32_e32 v9, v10
+; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288
+; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272
+; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256
+; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240
+; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224
+; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208
+; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192
+; GCN-NEXT: v_ldexp_f32 v4, v4, v9
+; GCN-NEXT: v_mul_f32_e32 v9, s2, v3
+; GCN-NEXT: v_rndne_f32_e32 v10, v9
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5
+; GCN-NEXT: v_sub_f32_e32 v11, v9, v10
+; GCN-NEXT: v_fma_f32 v9, s2, v3, -v9
; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6
+; GCN-NEXT: v_fmac_f32_e32 v9, s2, v7
; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; GCN-NEXT: v_mul_f32_e32 v10, s2, v3
-; GCN-NEXT: v_rndne_f32_e32 v11, v10
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31]
-; GCN-NEXT: v_fma_f32 v4, s2, v3, -v10
-; GCN-NEXT: v_sub_f32_e32 v12, v10, v11
-; GCN-NEXT: v_fmac_f32_e32 v4, s2, v7
-; GCN-NEXT: v_add_f32_e32 v4, v12, v4
-; GCN-NEXT: v_exp_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11
+; GCN-NEXT: v_add_f32_e32 v9, v11, v9
; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688
+; GCN-NEXT: s_waitcnt lgkmcnt(1)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v12, v4, a[96:127]
+; GCN-NEXT: v_exp_f32_e32 v4, v9
+; GCN-NEXT: v_cvt_i32_f32_e32 v9, v10
; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672
; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656
; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640
@@ -1269,60 +1269,60 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184
; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168
; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152
-; GCN-NEXT: v_ldexp_f32 v1, v4, v10
+; GCN-NEXT: v_ldexp_f32 v1, v4, v9
+; GCN-NEXT: v_mul_f32_e32 v4, s3, v3
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
+; GCN-NEXT: v_rndne_f32_e32 v9, v4
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6
-; GCN-NEXT: v_mul_f32_e32 v4, s3, v3
+; GCN-NEXT: v_sub_f32_e32 v10, v4, v9
+; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4
; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GCN-NEXT: v_rndne_f32_e32 v10, v4
+; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7
; GCN-NEXT: s_load_dword s8, s[4:5], 0x54
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95]
-; GCN-NEXT: v_sub_f32_e32 v1, v4, v10
-; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4
-; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7
-; GCN-NEXT: v_add_f32_e32 v1, v1, v4
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v12, v1, a[64:95]
+; GCN-NEXT: v_add_f32_e32 v1, v10, v4
; GCN-NEXT: v_exp_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_i32_f32_e32 v4, v10
+; GCN-NEXT: v_cvt_i32_f32_e32 v4, v9
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5
-; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456
-; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440
+; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456
+; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440
; GCN-NEXT: v_ldexp_f32 v1, v1, v4
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6
-; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; GCN-NEXT: v_mul_f32_e32 v4, s8, v3
+; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
+; GCN-NEXT: v_rndne_f32_e32 v9, v4
; GCN-NEXT: v_fma_f32 v3, s8, v3, -v4
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63]
-; GCN-NEXT: v_rndne_f32_e32 v1, v4
-; GCN-NEXT: v_sub_f32_e32 v10, v4, v1
; GCN-NEXT: v_fmac_f32_e32 v3, s8, v7
-; GCN-NEXT: v_add_f32_e32 v3, v10, v3
-; GCN-NEXT: v_exp_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT: ds_read_b128 a[148:151], v2 offset:57424
-; GCN-NEXT: ds_read_b128 a[144:147], v2 offset:57408
-; GCN-NEXT: ds_read_b128 a[128:131], v2 offset:57344
-; GCN-NEXT: ds_read_b128 a[132:135], v2 offset:57360
-; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376
-; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392
-; GCN-NEXT: v_ldexp_f32 v1, v3, v1
+; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v12, v1, a[32:63]
+; GCN-NEXT: v_sub_f32_e32 v1, v4, v9
+; GCN-NEXT: v_add_f32_e32 v1, v1, v3
+; GCN-NEXT: v_exp_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_i32_f32_e32 v3, v9
+; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408
+; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344
+; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360
+; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376
+; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392
+; GCN-NEXT: v_ldexp_f32 v1, v1, v3
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6
; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; GCN-NEXT: v_add_u32_e32 v0, s7, v0
-; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:112
+; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
; GCN-NEXT: s_waitcnt lgkmcnt(1)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159]
-; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:96
-; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:80
-; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:64
-; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:48
-; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:32
-; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16
-; GCN-NEXT: ds_write_b128 v0, a[96:99]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v12, v1, a[0:31]
+; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
+; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
+; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64
+; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48
+; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32
+; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16
+; GCN-NEXT: ds_write_b128 v0, a[128:131]
; GCN-NEXT: v_mov_b32_e32 v0, s7
; GCN-NEXT: ; kill: killed $sgpr4_sgpr5
; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0)
@@ -1335,14 +1335,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208
+; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288
+; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304
+; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256
+; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272
+; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224
+; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240
+; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192
+; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208
; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
@@ -1359,14 +1359,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624
; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576
; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592
-; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864
-; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880
-; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832
-; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848
-; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800
-; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816
-; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768
-; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784
+; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; GCN-NEXT: s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA:
@@ -1387,57 +1387,57 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:112
-; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:96
-; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:80
-; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:64
-; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:16
-; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:32
-; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:48
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v9, 1.0
+; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1
+; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48
+; EXACTCUTOFF-NEXT: v_mul_f32_e32 v9, s1, v3
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v12, 1.0
; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v5
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s1, v3
+; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v9
; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x42b17218
-; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10
+; EXACTCUTOFF-NEXT: v_sub_f32_e32 v11, v9, v10
+; EXACTCUTOFF-NEXT: v_fma_f32 v9, s1, v3, -v9
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, 0x7f800000
-; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11
-; EXACTCUTOFF-NEXT: v_fma_f32 v10, s1, v3, -v10
+; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v9, s1, v7
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s1, v7
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
+; EXACTCUTOFF-NEXT: v_add_f32_e32 v9, v11, v9
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:8304
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v9, v4, a[96:127]
-; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v10
-; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4
-; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v10
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v12, v4, a[128:159]
+; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v9
+; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v10
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:8192
+; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v9
+; EXACTCUTOFF-NEXT: v_mul_f32_e32 v9, s2, v3
+; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v9
; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5
+; EXACTCUTOFF-NEXT: v_sub_f32_e32 v11, v9, v10
+; EXACTCUTOFF-NEXT: v_fma_f32 v9, s2, v3, -v9
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6
+; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v9, s2, v7
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
-; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s2, v3
-; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31]
-; EXACTCUTOFF-NEXT: v_fma_f32 v4, s2, v3, -v10
-; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11
-; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s2, v7
-; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v4
-; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4
-; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11
+; EXACTCUTOFF-NEXT: v_add_f32_e32 v9, v11, v9
; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v12, v4, a[96:127]
+; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v9
+; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v9, v10
; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672
; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656
; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640
@@ -1454,60 +1454,60 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:49184
; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:49168
; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:49152
-; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v10
+; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v9
+; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3
; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
+; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v4
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6
-; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3
+; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v9
+; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4
+; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7
; EXACTCUTOFF-NEXT: s_load_dword s8, s[4:5], 0x54
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95]
-; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10
-; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4
-; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7
-; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v4
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v12, v1, a[64:95]
+; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v10, v4
; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1
-; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v10
+; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v9
; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5
-; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v2 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v2 offset:57440
; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v4
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6
-; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3
+; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
+; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v9, v4
; EXACTCUTOFF-NEXT: v_fma_f32 v3, s8, v3, -v4
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v1, a[32:63]
-; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v1, v4
-; EXACTCUTOFF-NEXT: v_sub_f32_e32 v10, v4, v1
; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v3, s8, v7
-; EXACTCUTOFF-NEXT: v_add_f32_e32 v3, v10, v3
-; EXACTCUTOFF-NEXT: v_exp_f32_e32 v3, v3
-; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v1, v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v2 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v2 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v2 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v2 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392
-; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v3, v1
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v2 offset:57424
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v12, v1, a[32:63]
+; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v9
+; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v3
+; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1
+; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v3, v9
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v2 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v2 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v2 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v2 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v2 offset:57392
+; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v3
; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s8, v5
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6
; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:112
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159]
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:96
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:80
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:64
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:48
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:32
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:16
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v12, v1, a[0:31]
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:64
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:48
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:16
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131]
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s7
; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0)
@@ -1520,14 +1520,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:8288
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:8304
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:8256
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:8272
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:8224
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:8240
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:8192
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:8208
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448
@@ -1544,14 +1544,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:24624
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:24576
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:24592
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; EXACTCUTOFF-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 1d08097452ce6..347fddbedb0a7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -20,19 +20,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
-; SDAG-NEXT: v_mov_b32_e32 v5, s16
-; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[14:15]
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
@@ -518,19 +518,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
+; GCN-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
-; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
-; GCN-NEXT: v_mov_b32_e32 v5, s16
-; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[14:15]
+; GCN-NEXT: v_mov_b32_e32 v1, s16
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; GCN-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -797,12 +797,12 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
@@ -811,12 +811,12 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v13, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
@@ -1308,12 +1308,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
@@ -1322,12 +1322,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v13, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
@@ -1470,12 +1470,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
@@ -1484,12 +1484,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v13, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
@@ -1632,12 +1632,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
@@ -1646,12 +1646,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v13, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
@@ -1794,12 +1794,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
@@ -1808,12 +1808,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v13, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index bfce9bcac22c7..929bb61ddabcf 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -50,17 +50,17 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX942-LABEL: local_atomic_fsub_ret_f32:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0
+; GFX942-NEXT: ds_read_b32 v1, v0
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -119,17 +119,17 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX90A-LABEL: local_atomic_fsub_ret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v1, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -262,17 +262,17 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-LABEL: local_atomic_fsub_ret_f32__offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX942-NEXT: ds_read_b32 v1, v0 offset:65532
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB1_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -331,17 +331,17 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX90A-LABEL: local_atomic_fsub_ret_f32__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v1, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9339,17 +9339,17 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX942-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ds_read_b32 v2, v0
+; GFX942-NEXT: ds_read_b32 v1, v0
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9408,17 +9408,17 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX90A-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v2, v0
+; GFX90A-NEXT: ds_read_b32 v1, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_add_f32_e32 v1, -4.0, v2
; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 84db54c2d537f..4eeb98bc11ea3 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
+; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt
; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-SDAG-NEXT: s_mov_b32 s5, s12
@@ -87,9 +87,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-SDAG-NEXT: s_mov_b32 s2, s1
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
+; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -357,7 +357,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
@@ -369,8 +369,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_mov_b32 s2, s1
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
-; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
index deb97a9812b42..053cf0e1c6906 100644
--- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
+++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
@@ -236,8 +236,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB8_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[16:19], v16, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v19
; GFX942-NEXT: v_lshrrev_b32_e32 v14, 16, v19
diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
index 1e042d3b4a31f..41ffd01fc7e23 100644
--- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll
@@ -12,19 +12,19 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1)
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX942-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
-; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], v[0:1]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v12, s4
-; GFX942-NEXT: v_mov_b32_e32 v13, s5
+; GFX942-NEXT: v_mov_b32_e32 v8, s4
+; GFX942-NEXT: v_mov_b32_e32 v9, s5
; GFX942-NEXT: v_mov_b32_e32 v4, s6
; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_mov_b32_e32 v6, s7
; GFX942-NEXT: v_mov_b32_e32 v7, s7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[12:13], v[4:7], v13
+; GFX942-NEXT: v_smfmac_i32_16x16x64_i8 v[10:13], v[8:9], v[4:7], v9
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dword v0, v11, s[2:3] offset:12
+; GFX942-NEXT: global_store_dword v0, v13, s[2:3] offset:12
; GFX942-NEXT: s_endpgm
entry:
%arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index d2008be4fd32a..fc32bc644ddcd 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -8,34 +8,33 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: v_pk_mov_b32 v[46:47], 0, 0
-; CHECK-NEXT: flat_load_dword v42, v[46:47]
+; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; CHECK-NEXT: flat_load_dword v42, v[44:45]
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_load_dwordx4 s[64:67], s[34:35], 0x8
; CHECK-NEXT: s_load_dword s68, s[34:35], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base
-; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s68, -1
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_cselect_b32 s5, s9, 0
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_cselect_b32 s6, s68, 0
-; CHECK-NEXT: v_mov_b32_e32 v57, s5
-; CHECK-NEXT: s_mov_b32 s5, s4
; CHECK-NEXT: s_add_u32 s50, s34, 48
-; CHECK-NEXT: v_accvgpr_write_b32 a33, s5
+; CHECK-NEXT: v_mov_b32_e32 v47, s5
+; CHECK-NEXT: s_mov_b32 s5, s4
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_accvgpr_write_b32 a32, s4
+; CHECK-NEXT: v_pk_mov_b32 v[62:63], s[4:5], s[4:5] op_sel:[0,1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
; CHECK-NEXT: s_mov_b32 s53, s14
-; CHECK-NEXT: v_mov_b32_e32 v56, s6
-; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[64:65], s[64:65] op_sel:[0,1]
+; CHECK-NEXT: v_mov_b32_e32 v46, s6
+; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[64:65], s[64:65] op_sel:[0,1]
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -48,15 +47,15 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: v_mov_b32_e32 v62, s66
-; CHECK-NEXT: v_mov_b32_e32 v63, s67
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33]
+; CHECK-NEXT: v_mov_b32_e32 v60, s66
+; CHECK-NEXT: v_mov_b32_e32 v61, s67
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[62:63]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[58:59]
-; CHECK-NEXT: v_mov_b32_e32 v44, 0
-; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000
+; CHECK-NEXT: flat_load_dwordx2 a[32:33], v[56:57]
+; CHECK-NEXT: v_mov_b32_e32 v58, 0
+; CHECK-NEXT: v_mov_b32_e32 v59, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -65,29 +64,31 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
-; CHECK-NEXT: flat_store_dwordx2 v[46:47], v[44:45]
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33]
+; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[58:59]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[62:63]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc
+; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s67
; CHECK-NEXT: v_mov_b32_e32 v0, s68
; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61]
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v58, v0, s[0:3], 0 offen
+; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_4
; CHECK-NEXT: ; %bb.1: ; %LeafBlock5
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v42
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
; CHECK-NEXT: ; %bb.2: ; %sw.bb17.i.i.i.i
-; CHECK-NEXT: v_mov_b32_e32 v44, 1
+; CHECK-NEXT: v_mov_b32_e32 v4, 1
; CHECK-NEXT: ; %bb.3: ; %Flow
; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
; CHECK-NEXT: .LBB0_4: ; %Flow8
@@ -105,10 +106,10 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: ; %bb.7: ; %Flow7
; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT: v_mov_b32_e32 v44, 0
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: .LBB0_8: ; %bb.1
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.9: ; %sw.bb.i.i.i.i.i
>From dd5267f379bf78fdc7c5bd2102ffbe4272d4fe8a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 4 Nov 2025 16:42:32 -0800
Subject: [PATCH 2/2] Update regressed tests
---
.../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 3131 +++++++++--------
.../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 2280 ++++++------
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 355 +-
3 files changed, 2900 insertions(+), 2866 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index ae83766cd6a4a..196958b74442f 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -644,10 +644,10 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a[0:1]
+; GFX90A-NEXT: ; def a[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -659,7 +659,7 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr2_agpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -672,8 +672,8 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -758,7 +758,7 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB12_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB12_4
@@ -768,8 +768,8 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB12_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -926,12 +926,12 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -939,23 +939,23 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB14_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB14_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -1016,12 +1016,12 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1029,23 +1029,23 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB15_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB15_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -1220,7 +1220,7 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB17_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB17_4
@@ -1230,8 +1230,8 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -1294,12 +1294,12 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1307,23 +1307,23 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB18_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB18_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB18_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -1384,12 +1384,10 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
@@ -1406,14 +1404,14 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB19_2
; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1559,12 +1557,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1589,12 +1587,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1611,24 +1609,24 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1641,23 +1639,23 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1]
+; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB22_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1694,12 +1692,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1723,12 +1721,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1745,23 +1743,23 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1774,22 +1772,22 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1]
+; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB24_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1810,23 +1808,23 @@ define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1839,22 +1837,22 @@ define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1]
+; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB25_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1891,12 +1889,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1920,12 +1918,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB26_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1942,24 +1940,24 @@ define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1972,23 +1970,23 @@ define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1]
+; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB27_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2009,23 +2007,23 @@ define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2038,22 +2036,22 @@ define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1]
+; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB28_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2073,24 +2071,25 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0
; GFX90A-NEXT: ;;#ASMSTART
@@ -2128,30 +2127,31 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33
-; GFX90A-NEXT: flat_load_dword v1, v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33
+; GFX90A-NEXT: flat_load_dword v1, v[2:3]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v0
+; GFX90A-NEXT: ; def a34
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v0
+; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v1, v[4:5], v[2:3] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a32, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
@@ -2190,53 +2190,58 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a32
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:28 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
; GFX950-NEXT: v_accvgpr_write_b32 a33, v1
; GFX950-NEXT: v_accvgpr_write_b32 a32, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def a34
+; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
@@ -2270,28 +2275,26 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX950-NEXT: v_accvgpr_write_b32 a29, v29
; GFX950-NEXT: v_accvgpr_write_b32 a30, v30
; GFX950-NEXT: v_accvgpr_write_b32 a31, v31
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a32
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a33
-; GFX950-NEXT: flat_load_dword v1, v[4:5]
-; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v0
-; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a32
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a33
+; GFX950-NEXT: flat_load_dword v1, v[2:3]
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a34
; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v3, v1
-; GFX950-NEXT: v_xor_b32_e32 v2, v3, v0
+; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap v1, v[4:5], v[2:3] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB29_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a32, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a32, v0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
@@ -2330,24 +2333,25 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a32
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
@@ -2511,14 +2515,14 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB32_4: ; %Flow3
@@ -2572,14 +2576,14 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB32_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB32_4: ; %Flow3
@@ -2619,10 +2623,10 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB33_4
@@ -2632,40 +2636,40 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB33_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB33_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB33_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -2678,9 +2682,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB33_4
@@ -2690,37 +2694,37 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 {
; GFX950-NEXT: .LBB33_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB33_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB33_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB33_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
-; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB33_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -2759,14 +2763,14 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB34_4: ; %Flow3
@@ -2818,14 +2822,14 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB34_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB34_4: ; %Flow3
@@ -2864,9 +2868,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB35_4
@@ -2876,40 +2880,40 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB35_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB35_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB35_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB35_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -2920,9 +2924,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB35_4
@@ -2932,37 +2936,37 @@ define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: .LBB35_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB35_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB35_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB35_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
-; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB35_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -2981,9 +2985,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB36_4
@@ -2993,40 +2997,40 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-NEXT: .LBB36_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB36_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB36_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB36_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3037,9 +3041,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB36_4
@@ -3049,37 +3053,37 @@ define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 {
; GFX950-NEXT: .LBB36_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB36_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB36_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB36_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
-; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB36_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -3118,14 +3122,14 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB37_4: ; %Flow3
@@ -3177,14 +3181,14 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB37_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB37_4: ; %Flow3
@@ -3224,10 +3228,10 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB38_4
@@ -3237,40 +3241,40 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-NEXT: .LBB38_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB38_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB38_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB38_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3283,9 +3287,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB38_4
@@ -3295,37 +3299,37 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 {
; GFX950-NEXT: .LBB38_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB38_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB38_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB38_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
-; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB38_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -3344,9 +3348,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB39_4
@@ -3356,40 +3360,40 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB39_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB39_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB39_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -3400,9 +3404,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB39_4
@@ -3412,37 +3416,37 @@ define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 {
; GFX950-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB39_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB39_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB39_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3
-; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB39_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -5439,13 +5443,13 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -5468,12 +5472,12 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB69_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -5492,44 +5496,44 @@ define void @flat_atomic_nand_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2
-; GFX90A-NEXT: v_not_b32_e32 v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4
+; GFX90A-NEXT: v_not_b32_e32 v2, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB70_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_nand_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB70_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6061,13 +6065,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6092,13 +6096,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB85_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6117,25 +6121,25 @@ define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2
-; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4
+; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB86_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6145,26 +6149,26 @@ define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v3
-; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2
-; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
+; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4
+; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX950-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB86_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v3
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
@@ -6192,12 +6196,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6220,12 +6224,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB87_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6241,20 +6245,20 @@ define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB88_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6267,20 +6271,20 @@ define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_usub_sat_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB88_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6402,35 +6406,35 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB90_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB90_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB90_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB90_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6587,35 +6591,35 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB92_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB92_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB92_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
-; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB92_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6885,14 +6889,14 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB95_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB95_4: ; %Flow3
@@ -6950,14 +6954,14 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB95_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB95_4: ; %Flow3
@@ -6994,58 +6998,58 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_nand_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB96_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[2:3]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB96_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_and_b32_e32 v4, v7, v1
-; GFX90A-NEXT: v_and_b32_e32 v8, v6, v0
-; GFX90A-NEXT: v_not_b32_e32 v5, v4
-; GFX90A-NEXT: v_not_b32_e32 v4, v8
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4
+; GFX90A-NEXT: v_not_b32_e32 v1, v0
+; GFX90A-NEXT: v_not_b32_e32 v0, v8
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB96_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB96_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB96_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1
+; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0
-; GFX90A-NEXT: v_not_b32_e32 v0, v0
-; GFX90A-NEXT: v_not_b32_e32 v1, v1
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v3, v3
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB96_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -7055,54 +7059,54 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB96_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB96_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_and_b32_e32 v2, v9, v1
-; GFX950-NEXT: v_and_b32_e32 v3, v8, v0
-; GFX950-NEXT: v_not_b32_e32 v7, v2
-; GFX950-NEXT: v_not_b32_e32 v6, v3
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
+; GFX950-NEXT: v_and_b32_e32 v0, v3, v5
+; GFX950-NEXT: v_and_b32_e32 v8, v2, v4
+; GFX950-NEXT: v_not_b32_e32 v1, v0
+; GFX950-NEXT: v_not_b32_e32 v0, v8
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB96_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB96_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB96_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX950-NEXT: v_and_b32_e32 v0, v2, v0
-; GFX950-NEXT: v_not_b32_e32 v1, v1
-; GFX950-NEXT: v_not_b32_e32 v0, v0
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_and_b32_e32 v2, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX950-NEXT: v_not_b32_e32 v3, v2
+; GFX950-NEXT: v_not_b32_e32 v2, v4
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB96_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -8492,14 +8496,14 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB111_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB111_4: ; %Flow3
@@ -8561,14 +8565,14 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB111_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB111_4: ; %Flow3
@@ -8607,60 +8611,60 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB112_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[2:3]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB112_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v0
-; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v1, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB112_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB112_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB112_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v4, v0
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v1, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB112_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8670,60 +8674,60 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB112_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB112_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1]
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB112_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB112_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB112_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v1, vcc
-; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB112_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -8764,14 +8768,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB113_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB113_4: ; %Flow3
@@ -8831,14 +8835,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB113_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB113_4: ; %Flow3
@@ -8877,58 +8881,58 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB114_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
-; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB114_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB114_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB114_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v4, v2
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v3, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -8938,58 +8942,58 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB114_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB114_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB114_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB114_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB114_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB114_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -10009,10 +10013,10 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
@@ -10020,39 +10024,39 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_cbranch_execz .LBB128_6
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v5
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB128_3
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB128_3: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB128_5
; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB128_5: ; %Flow1
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB128_6: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB128_8
; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
-; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
+; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: .LBB128_8: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -10249,23 +10253,23 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[4:5]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB130_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
+; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
+; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7]
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
@@ -10274,19 +10278,19 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_cbranch_execnz .LBB130_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX950-NEXT: .LBB130_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB130_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB130_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
@@ -10406,31 +10410,31 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB132_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB132_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB132_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
@@ -10595,31 +10599,31 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB134_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB134_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB134_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
@@ -10729,16 +10733,15 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
@@ -10997,16 +11000,15 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
@@ -11237,12 +11239,12 @@ define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB139_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11274,20 +11276,20 @@ define void @flat_atomic_fadd_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB140_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB140_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11334,12 +11336,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB141_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11362,12 +11364,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB141_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11383,20 +11385,20 @@ define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB142_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11409,20 +11411,20 @@ define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_fsub_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB142_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11457,13 +11459,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB143_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11488,13 +11490,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB143_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11510,22 +11512,22 @@ define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB144_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11538,23 +11540,23 @@ define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_fmax_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_max_f16 v4, v2, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB144_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11589,13 +11591,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB145_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11620,13 +11622,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB145_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11642,22 +11644,22 @@ define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB146_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11670,23 +11672,23 @@ define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_fmin_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_min_f16 v4, v2, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB146_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11726,13 +11728,13 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB147_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11755,12 +11757,12 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB147_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11776,53 +11778,53 @@ define void @flat_atomic_fmaximum_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2
-; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4
+; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB148_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB148_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -11862,13 +11864,13 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB149_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11891,12 +11893,12 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB149_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11912,53 +11914,53 @@ define void @flat_atomic_fminimum_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2
-; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4
+; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB150_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB150_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -12013,13 +12015,13 @@ define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB151_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12051,44 +12053,44 @@ define void @flat_atomic_fadd_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB152_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB152_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -12146,13 +12148,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB153_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12180,13 +12182,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB153_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12202,76 +12204,76 @@ define void @flat_atomic_fsub_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB154_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fsub_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB154_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -12316,13 +12318,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB155_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12350,13 +12352,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB155_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12372,76 +12374,76 @@ define void @flat_atomic_fmax_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB156_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB156_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -12486,13 +12488,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB157_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12520,13 +12522,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB157_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12542,76 +12544,76 @@ define void @flat_atomic_fmin_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB158_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB158_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -12661,13 +12663,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB159_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12695,13 +12697,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB159_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12717,81 +12719,81 @@ define void @flat_atomic_fmaximum_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v4
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2
-; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
+; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB160_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v5
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2
-; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4
+; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB160_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -12841,13 +12843,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB161_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12875,13 +12877,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB161_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12897,81 +12899,81 @@ define void @flat_atomic_fminimum_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v4
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2
-; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
+; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB162_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v5
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2
-; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4
+; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB162_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -13306,27 +13308,28 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB171_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4
-; GFX90A-NEXT: v_not_b32_e32 v2, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_and_b32_e32 v0, v1, v4
+; GFX90A-NEXT: v_not_b32_e32 v0, v0
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB171_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13336,26 +13339,27 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB171_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_bitop3_b32 v0, v1, v4, v1 bitop3:0x3f
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB171_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13375,24 +13379,24 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2
-; GFX90A-NEXT: v_not_b32_e32 v4, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4
+; GFX90A-NEXT: v_not_b32_e32 v2, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB172_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -13400,20 +13404,20 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB172_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14066,28 +14070,29 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB189_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4
-; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e32 v0, v1, v4
+; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB189_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14097,29 +14102,30 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB189_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4
-; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
+; GFX950-NEXT: v_sub_u32_e32 v0, v1, v4
+; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB189_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14139,25 +14145,25 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2
-; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4
+; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB190_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -14168,26 +14174,26 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v3
-; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2
-; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
+; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4
+; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX950-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB190_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v3
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
@@ -14202,26 +14208,27 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB191_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e64 v0, v1, v4 clamp
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB191_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14231,26 +14238,27 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB191_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_sub_u32_e64 v0, v1, v4 clamp
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB191_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14267,20 +14275,20 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB192_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14294,20 +14302,20 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB192_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -14335,38 +14343,38 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: s_addc_u32 s5, s17, 0
; GFX90A-NEXT: s_cmp_eq_u32 s5, s7
+; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB193_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: s_cbranch_execz .LBB193_3
; GFX90A-NEXT: s_branch .LBB193_4
; GFX90A-NEXT: .LBB193_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $agpr2_agpr3
; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword a0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword a1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: buffer_load_dword a2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword a3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use a[0:1]
+; GFX90A-NEXT: ; use a[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -15062,14 +15070,14 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB201_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_branch .LBB201_6
; GFX90A-NEXT: .LBB201_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
@@ -15126,14 +15134,14 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB201_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_branch .LBB201_6
; GFX950-NEXT: .LBB201_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
@@ -15174,50 +15182,50 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB202_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB202_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_and_b32_e32 v2, v9, v1
-; GFX90A-NEXT: v_and_b32_e32 v3, v8, v0
-; GFX90A-NEXT: v_not_b32_e32 v7, v2
-; GFX90A-NEXT: v_not_b32_e32 v6, v3
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
+; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4
+; GFX90A-NEXT: v_not_b32_e32 v1, v0
+; GFX90A-NEXT: v_not_b32_e32 v0, v8
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB202_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB202_6
; GFX90A-NEXT: .LBB202_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB202_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0
-; GFX90A-NEXT: v_not_b32_e32 v0, v0
-; GFX90A-NEXT: v_not_b32_e32 v1, v1
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v3, v3
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB202_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[2:3]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -15232,46 +15240,46 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB202_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB202_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_and_b32_e32 v2, v9, v1
-; GFX950-NEXT: v_and_b32_e32 v3, v8, v0
-; GFX950-NEXT: v_not_b32_e32 v7, v2
-; GFX950-NEXT: v_not_b32_e32 v6, v3
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
+; GFX950-NEXT: v_and_b32_e32 v0, v3, v5
+; GFX950-NEXT: v_and_b32_e32 v8, v2, v4
+; GFX950-NEXT: v_not_b32_e32 v1, v0
+; GFX950-NEXT: v_not_b32_e32 v0, v8
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB202_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB202_6
; GFX950-NEXT: .LBB202_4:
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB202_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX950-NEXT: v_and_b32_e32 v0, v2, v0
-; GFX950-NEXT: v_not_b32_e32 v1, v1
-; GFX950-NEXT: v_not_b32_e32 v0, v0
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_and_b32_e32 v2, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX950-NEXT: v_not_b32_e32 v3, v2
+; GFX950-NEXT: v_not_b32_e32 v2, v4
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB202_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -16798,14 +16806,14 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB219_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_branch .LBB219_6
; GFX90A-NEXT: .LBB219_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
@@ -16866,14 +16874,14 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB219_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_branch .LBB219_6
; GFX950-NEXT: .LBB219_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
@@ -16916,52 +16924,52 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB220_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB220_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
-; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB220_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB220_6
; GFX90A-NEXT: .LBB220_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB220_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v0
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v1, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB220_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[2:3]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -16976,52 +16984,52 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB220_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB220_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1]
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB220_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB220_6
; GFX950-NEXT: .LBB220_4:
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB220_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v1, vcc
-; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB220_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -17062,14 +17070,14 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB221_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_branch .LBB221_6
; GFX90A-NEXT: .LBB221_4:
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
@@ -17128,14 +17136,14 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB221_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_branch .LBB221_6
; GFX950-NEXT: .LBB221_4:
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
@@ -17178,50 +17186,50 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cbranch_vccz .LBB222_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB222_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
-; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB222_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB222_6
; GFX90A-NEXT: .LBB222_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB222_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB222_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[2:3]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -17236,50 +17244,50 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cbranch_vccz .LBB222_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB222_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB222_2
; GFX950-NEXT: ; %bb.3: ; %Flow
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB222_6
; GFX950-NEXT: .LBB222_4:
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB222_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB222_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -19015,16 +19023,15 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
@@ -19275,16 +19282,15 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
@@ -19494,26 +19500,27 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB247_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB247_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19548,20 +19555,20 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB248_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB248_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19597,26 +19604,27 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB249_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB249_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19626,26 +19634,27 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB249_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_pk_add_f16 v0, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB249_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -19662,20 +19671,20 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB250_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19689,20 +19698,20 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB250_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19723,29 +19732,29 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB251_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19755,30 +19764,30 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB251_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -19795,29 +19804,29 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB252_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v2
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -19825,30 +19834,30 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_max_f16 v4, v2, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB252_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v2
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
@@ -19863,29 +19872,29 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB253_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19895,30 +19904,30 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB253_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -19935,29 +19944,29 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB254_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v2
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -19965,30 +19974,30 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_min_f16 v4, v2, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB254_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v2
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
@@ -20024,13 +20033,13 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB255_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20040,26 +20049,27 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB255_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v1, v4, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB255_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20076,33 +20086,34 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2
-; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v4
+; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB256_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -20110,20 +20121,20 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB256_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20165,13 +20176,13 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB257_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20181,26 +20192,27 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB257_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v1, v4, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB257_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20217,33 +20229,34 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2
-; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_pk_min_f16 v0, v1, v4
+; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB258_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -20251,20 +20264,20 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB258_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20321,13 +20334,13 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB259_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20362,45 +20375,45 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB260_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB260_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -20462,13 +20475,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB261_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20498,13 +20511,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB261_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20521,45 +20534,45 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB262_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB262_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -20567,33 +20580,33 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB262_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -20640,13 +20653,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB263_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20676,13 +20689,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB263_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20699,45 +20712,45 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB264_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB264_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -20745,33 +20758,33 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB264_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -20818,13 +20831,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB265_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20854,13 +20867,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB265_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20877,45 +20890,45 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB266_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB266_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -20923,33 +20936,33 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB266_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -21001,13 +21014,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB267_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -21037,13 +21050,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB267_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -21060,50 +21073,50 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB268_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2
-; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX90A-NEXT: v_max_f32_e32 v8, v0, v4
+; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
+; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
+; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB268_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v5
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -21111,33 +21124,33 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2
-; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4
+; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB268_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
@@ -21189,13 +21202,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB269_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -21225,13 +21238,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB269_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -21248,50 +21261,50 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB270_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2
-; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX90A-NEXT: v_min_f32_e32 v8, v0, v4
+; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
+; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8
+; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB270_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v5
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -21299,33 +21312,33 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2
-; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: v_minimum3_f32 v0, v0, v4, v4
+; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB270_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
index c3531f16248e9..b6fe0c756a106 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -1062,12 +1062,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1092,12 +1092,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1114,24 +1114,24 @@ define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1144,23 +1144,23 @@ define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 {
; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off
+; GFX950-NEXT: global_load_dword v3, v[0:1], off
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB22_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1197,12 +1197,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1226,12 +1226,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1248,23 +1248,23 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0
; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1277,22 +1277,22 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0
; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off
+; GFX950-NEXT: global_load_dword v3, v[0:1], off
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB24_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1313,23 +1313,23 @@ define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0
; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1342,22 +1342,22 @@ define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0
; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off
+; GFX950-NEXT: global_load_dword v3, v[0:1], off
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB25_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1394,12 +1394,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1423,12 +1423,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB26_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1445,24 +1445,24 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0
; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_a_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1475,23 +1475,23 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0
; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_a_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off
+; GFX950-NEXT: global_load_dword v3, v[0:1], off
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB27_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1512,23 +1512,23 @@ define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0
; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1541,22 +1541,22 @@ define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0
; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_v_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off
+; GFX950-NEXT: global_load_dword v3, v[0:1], off
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB28_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1576,24 +1576,25 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1)
; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0
; GFX90A-NEXT: ;;#ASMSTART
@@ -1631,30 +1632,31 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1)
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33
-; GFX90A-NEXT: global_load_dword v1, v[4:5], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33
+; GFX90A-NEXT: global_load_dword v1, v[2:3], off
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v0
+; GFX90A-NEXT: ; def a34
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v0
+; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v1, v[4:5], v[2:3], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a32, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
@@ -1693,53 +1695,58 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a32
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:28 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
; GFX950-NEXT: v_accvgpr_write_b32 a33, v1
; GFX950-NEXT: v_accvgpr_write_b32 a32, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; def a34
+; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
@@ -1773,28 +1780,26 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1)
; GFX950-NEXT: v_accvgpr_write_b32 a29, v29
; GFX950-NEXT: v_accvgpr_write_b32 a30, v30
; GFX950-NEXT: v_accvgpr_write_b32 a31, v31
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a32
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a33
-; GFX950-NEXT: global_load_dword v1, v[4:5], off
-; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v0
-; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a32
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a33
+; GFX950-NEXT: global_load_dword v1, v[2:3], off
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a34
; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v3, v1
-; GFX950-NEXT: v_xor_b32_e32 v2, v3, v0
+; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap v1, v[4:5], v[2:3], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB29_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a32, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a32, v0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
@@ -1833,24 +1838,25 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a32
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -2007,14 +2013,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2040,14 +2046,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB32_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2068,28 +2074,28 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2101,26 +2107,26 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB33_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -2151,14 +2157,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2182,14 +2188,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB34_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2209,27 +2215,27 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB35_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2239,26 +2245,26 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB35_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -2276,27 +2282,27 @@ define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2306,26 +2312,26 @@ define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB36_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -2356,14 +2362,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2387,14 +2393,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB37_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2415,28 +2421,28 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2448,26 +2454,26 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB38_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -2485,27 +2491,27 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2515,26 +2521,26 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3
-; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB39_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -3893,13 +3899,13 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -3922,12 +3928,12 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB69_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -3946,44 +3952,44 @@ define void @global_atomic_nand_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2
-; GFX90A-NEXT: v_not_b32_e32 v4, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4
+; GFX90A-NEXT: v_not_b32_e32 v2, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB70_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_nand_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB70_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4515,13 +4521,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -4546,13 +4552,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB85_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -4571,25 +4577,25 @@ define void @global_atomic_usub_cond_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2
-; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4
+; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB86_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4599,26 +4605,26 @@ define void @global_atomic_usub_cond_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v3
-; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2
-; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
+; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4
+; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX950-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB86_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v3
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -4646,12 +4652,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -4674,12 +4680,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB87_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -4695,20 +4701,20 @@ define void @global_atomic_usub_sat_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_usub_sat_i32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB88_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4721,20 +4727,20 @@ define void @global_atomic_usub_sat_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-LABEL: global_atomic_usub_sat_i32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB88_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5000,14 +5006,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB95_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5033,14 +5039,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB95_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5059,26 +5065,26 @@ define void @global_atomic_nand_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB96_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_and_b32_e32 v4, v7, v3
-; GFX90A-NEXT: v_and_b32_e32 v8, v6, v2
-; GFX90A-NEXT: v_not_b32_e32 v5, v4
-; GFX90A-NEXT: v_not_b32_e32 v4, v8
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc
+; GFX90A-NEXT: v_and_b32_e32 v2, v5, v7
+; GFX90A-NEXT: v_and_b32_e32 v8, v4, v6
+; GFX90A-NEXT: v_not_b32_e32 v3, v2
+; GFX90A-NEXT: v_not_b32_e32 v2, v8
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB96_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5088,26 +5094,26 @@ define void @global_atomic_nand_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB96_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_and_b32_e32 v4, v7, v3
-; GFX950-NEXT: v_and_b32_e32 v8, v6, v2
-; GFX950-NEXT: v_not_b32_e32 v5, v4
-; GFX950-NEXT: v_not_b32_e32 v4, v8
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, v5, v7
+; GFX950-NEXT: v_and_b32_e32 v8, v4, v6
+; GFX950-NEXT: v_not_b32_e32 v3, v2
+; GFX950-NEXT: v_not_b32_e32 v2, v8
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB96_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -5664,14 +5670,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB111_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5700,14 +5706,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB111_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5726,27 +5732,27 @@ define void @global_atomic_usub_cond_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
-; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc
+; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
+; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB112_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5756,29 +5762,29 @@ define void @global_atomic_usub_cond_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB112_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -5809,14 +5815,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB113_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5844,14 +5850,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB113_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5870,26 +5876,26 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
-; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc
+; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
+; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB114_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[4:5]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5899,28 +5905,28 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB114_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB114_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[4:5]
+; GFX950-NEXT: ; use v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -7379,12 +7385,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB141_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7407,12 +7413,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB141_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7428,20 +7434,20 @@ define void @global_atomic_fsub_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fsub_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB142_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7454,20 +7460,20 @@ define void @global_atomic_fsub_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-LABEL: global_atomic_fsub_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB142_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7502,13 +7508,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB143_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7533,13 +7539,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB143_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7555,22 +7561,22 @@ define void @global_atomic_fmax_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fmax_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB144_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7583,23 +7589,23 @@ define void @global_atomic_fmax_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-LABEL: global_atomic_fmax_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_max_f16 v4, v2, v3
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB144_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7634,13 +7640,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB145_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7665,13 +7671,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB145_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7687,22 +7693,22 @@ define void @global_atomic_fmin_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fmin_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2
; GFX90A-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB146_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7715,23 +7721,23 @@ define void @global_atomic_fmin_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-LABEL: global_atomic_fmin_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_min_f16 v4, v2, v3
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB146_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7771,13 +7777,13 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB147_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7800,12 +7806,12 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB147_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7821,53 +7827,53 @@ define void @global_atomic_fmaximum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fmaximum_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2
-; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4
+; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB148_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmaximum_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB148_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7907,13 +7913,13 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB149_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7936,12 +7942,12 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB149_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7957,53 +7963,53 @@ define void @global_atomic_fminimum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fminimum_v2f16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2
-; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4
+; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB150_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fminimum_v2f16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3
-; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB150_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8058,13 +8064,13 @@ define void @global_atomic_fadd_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB151_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8096,44 +8102,44 @@ define void @global_atomic_fadd_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fadd_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB152_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB152_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8191,13 +8197,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB153_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8225,13 +8231,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB153_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8247,76 +8253,76 @@ define void @global_atomic_fsub_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fsub_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB154_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fsub_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB154_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -8361,13 +8367,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB155_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8395,13 +8401,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB155_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8417,76 +8423,76 @@ define void @global_atomic_fmax_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fmax_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB156_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmax_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB156_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -8531,13 +8537,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB157_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8565,13 +8571,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB157_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8587,76 +8593,76 @@ define void @global_atomic_fmin_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fmin_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB158_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmin_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB158_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -8706,13 +8712,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB159_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8740,13 +8746,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB159_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8762,81 +8768,81 @@ define void @global_atomic_fmaximum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fmaximum_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v4
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
-; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2
-; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off offset:40 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4
+; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
+; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB160_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v5
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmaximum_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2
-; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4
+; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB160_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -8886,13 +8892,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB161_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8920,13 +8926,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB161_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8942,81 +8948,81 @@ define void @global_atomic_fminimum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fminimum_v2bf16_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:40
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v4
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
-; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2
-; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4
-; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off offset:40 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4
+; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6
+; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc
+; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8
+; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB162_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v5
+; GFX90A-NEXT: ; use v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fminimum_v2bf16_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40
+; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v4
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2
-; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4
-; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4
+; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2
+; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB162_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v4
+; GFX950-NEXT: ; use v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -9349,13 +9355,13 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB171_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9379,12 +9385,12 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB171_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9400,55 +9406,55 @@ define void @global_atomic_nand_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr)
; GFX90A-LABEL: global_atomic_nand_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v2, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v1
+; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_and_b32_e32 v2, v3, v1
-; GFX90A-NEXT: v_not_b32_e32 v2, v2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_and_b32_e32 v0, v1, v3
+; GFX90A-NEXT: v_not_b32_e32 v0, v0
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB172_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v2
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_nand_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v1
-; GFX950-NEXT: v_bitop3_b32 v4, v5, v2, v5 bitop3:0x3f
-; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_bitop3_b32 v0, v1, v3, v1 bitop3:0x3f
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB172_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v1
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -10078,13 +10084,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB189_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -10110,13 +10116,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB189_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -10132,59 +10138,59 @@ define void @global_atomic_usub_cond_i32_saddr_ret_av_av(ptr addrspace(1) inreg
; GFX90A-LABEL: global_atomic_usub_cond_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v2, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v1
+; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v1
-; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX90A-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e32 v0, v1, v3
+; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB190_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v2
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_usub_cond_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v2, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v1
+; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
-; GFX950-NEXT: v_sub_u32_e32 v2, v3, v1
-; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
+; GFX950-NEXT: v_sub_u32_e32 v0, v1, v3
+; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX950-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB190_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v2
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -10213,12 +10219,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB191_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -10242,12 +10248,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB191_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -10263,54 +10269,54 @@ define void @global_atomic_usub_sat_i32_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX90A-LABEL: global_atomic_usub_sat_i32_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v2 clamp
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_sub_u32_e64 v0, v1, v3 clamp
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB192_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v1
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_usub_sat_i32_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v1
-; GFX950-NEXT: v_sub_u32_e64 v4, v5, v2 clamp
-; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_sub_u32_e64 v0, v1, v3 clamp
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB192_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v1
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -10658,14 +10664,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB201_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -10692,14 +10698,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB201_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -10715,60 +10721,60 @@ define void @global_atomic_nand_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr)
; GFX90A-LABEL: global_atomic_nand_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB202_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_and_b32_e32 v2, v9, v1
-; GFX90A-NEXT: v_and_b32_e32 v3, v8, v0
-; GFX90A-NEXT: v_not_b32_e32 v7, v2
-; GFX90A-NEXT: v_not_b32_e32 v6, v3
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc
+; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_and_b32_e32 v7, v2, v4
+; GFX90A-NEXT: v_not_b32_e32 v1, v0
+; GFX90A-NEXT: v_not_b32_e32 v0, v7
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB202_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[2:3]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_nand_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80
+; GFX950-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB202_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_and_b32_e32 v2, v9, v1
-; GFX950-NEXT: v_and_b32_e32 v3, v8, v0
-; GFX950-NEXT: v_not_b32_e32 v7, v2
-; GFX950-NEXT: v_not_b32_e32 v6, v3
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, v3, v5
+; GFX950-NEXT: v_and_b32_e32 v7, v2, v4
+; GFX950-NEXT: v_not_b32_e32 v1, v0
+; GFX950-NEXT: v_not_b32_e32 v0, v7
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB202_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -11425,14 +11431,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB219_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -11462,14 +11468,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB219_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11485,64 +11491,64 @@ define void @global_atomic_usub_cond_i64_saddr_ret_av_av(ptr addrspace(1) inreg
; GFX90A-LABEL: global_atomic_usub_cond_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB220_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
-; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB220_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[2:3]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_usub_cond_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80
+; GFX950-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB220_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1]
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB220_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -11574,14 +11580,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB221_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -11610,14 +11616,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB221_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11633,62 +11639,62 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX90A-LABEL: global_atomic_usub_sat_i64_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB222_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
-; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB222_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[2:3]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_usub_sat_i64_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80
+; GFX950-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB222_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3]
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
-; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB222_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -13197,12 +13203,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB249_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13226,12 +13232,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB249_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13247,54 +13253,54 @@ define void @global_atomic_fsub_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt
; GFX90A-LABEL: global_atomic_fsub_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_pk_add_f16 v0, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB250_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v1
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fsub_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v1
-; GFX950-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_pk_add_f16 v0, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB250_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v1
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -13324,13 +13330,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB251_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13356,13 +13362,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB251_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13378,59 +13384,59 @@ define void @global_atomic_fmax_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt
; GFX90A-LABEL: global_atomic_fmax_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0
; GFX90A-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_pk_max_f16 v1, v5, v5
-; GFX90A-NEXT: v_pk_max_f16 v4, v1, v2
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v3
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB252_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v1
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmax_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v1
-; GFX950-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_max_f16 v4, v1, v2
-; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v3
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB252_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v1
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -13460,13 +13466,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB253_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13492,13 +13498,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB253_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13514,59 +13520,59 @@ define void @global_atomic_fmin_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt
; GFX90A-LABEL: global_atomic_fmin_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0
; GFX90A-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_pk_max_f16 v1, v5, v5
-; GFX90A-NEXT: v_pk_min_f16 v4, v1, v2
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_min_f16 v0, v0, v3
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB254_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v1
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmin_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v1
-; GFX950-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_min_f16 v4, v1, v2
-; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_pk_min_f16 v0, v0, v3
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB254_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v1
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -13601,13 +13607,13 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB255_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13631,12 +13637,12 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB255_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13652,61 +13658,61 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg
; GFX90A-LABEL: global_atomic_fmaximum_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v1
+; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_max_f16 v3, v5, v1
-; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_sdwa v3, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_perm_b32 v4, v3, v4, s8
-; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v3
+; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v5, v4, v0, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s8
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB256_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmaximum_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v1
-; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v2, v2
-; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v1, v3, v3
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB256_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v1
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -13741,13 +13747,13 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB257_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13771,12 +13777,12 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB257_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13792,61 +13798,61 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg
; GFX90A-LABEL: global_atomic_fminimum_v2f16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX90A-NEXT: s_mov_b32 s8, 0x5040100
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v1
+; GFX90A-NEXT: ; def v3
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_min_f16 v3, v5, v1
-; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_sdwa v3, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_perm_b32 v4, v3, v4, s8
-; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_pk_min_f16 v0, v1, v3
+; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v5, v4, v0, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s8
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB258_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fminimum_v2f16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v1
-; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v2, v2
-; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v1, v3, v3
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB258_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v1
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -13896,13 +13902,13 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB259_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13935,45 +13941,45 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p
; GFX90A-LABEL: global_atomic_fadd_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB260_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v1
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB260_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -14033,13 +14039,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB261_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14068,13 +14074,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB261_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14090,78 +14096,78 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p
; GFX90A-LABEL: global_atomic_fsub_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB262_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v1
-; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB262_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fsub_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v3
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT: v_sub_f32_e32 v3, v3, v1
-; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3
-; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX950-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB262_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v3
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -14207,13 +14213,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB263_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14242,13 +14248,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB263_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14264,78 +14270,78 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p
; GFX90A-LABEL: global_atomic_fmax_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB264_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v1
-; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB264_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmax_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v3
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT: v_max_f32_e32 v3, v3, v1
-; GFX950-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3
-; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB264_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v3
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -14381,13 +14387,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB265_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14416,13 +14422,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB265_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14438,78 +14444,78 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p
; GFX90A-LABEL: global_atomic_fmin_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v2
+; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB266_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v3, v3, v1
-; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB266_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v3
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmin_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v3
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT: v_min_f32_e32 v3, v3, v1
-; GFX950-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3
-; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX950-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB266_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v3
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -14560,13 +14566,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB267_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14595,13 +14601,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB267_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14617,83 +14623,83 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre
; GFX90A-LABEL: global_atomic_fmaximum_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v4, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB268_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_max_f32_e32 v7, v4, v1
-; GFX90A-NEXT: v_max_f32_e32 v8, v6, v3
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v3
-; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_max_f32_e32 v7, v0, v3
+; GFX90A-NEXT: v_max_f32_e32 v8, v6, v5
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v5
+; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v4, v8, vcc
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB268_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fmaximum_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v3
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT: v_maximum3_f32 v3, v3, v1, v1
-; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3
-; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT: v_maximum3_f32 v5, v5, v4, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB268_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v3
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
@@ -14744,13 +14750,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB269_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14779,13 +14785,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB269_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14801,83 +14807,83 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre
; GFX90A-LABEL: global_atomic_fminimum_v2bf16_saddr_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: global_load_dword v4, v0, s[16:17] offset:40
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB270_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX90A-NEXT: v_min_f32_e32 v7, v4, v1
-; GFX90A-NEXT: v_min_f32_e32 v8, v6, v3
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v3
-; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v1
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_min_f32_e32 v7, v0, v3
+; GFX90A-NEXT: v_min_f32_e32 v8, v6, v5
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v5
+; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v4, v8, vcc
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
-; GFX90A-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[16:17] offset:40 glc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB270_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v4
+; GFX90A-NEXT: ; use v0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_fminimum_v2bf16_saddr_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40
+; GFX950-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v2
+; GFX950-NEXT: ; def v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
-; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v3
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX950-NEXT: v_minimum3_f32 v3, v3, v1, v1
-; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3
-; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT: v_minimum3_f32 v5, v5, v4, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0
+; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB270_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v3
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b045c761436de..5e18b469a4e88 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -6,34 +6,35 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v4
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX942-NEXT: v_mov_b32_e32 v2, 8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dword v3, v1, s[0:1]
; GFX942-NEXT: s_mov_b32 s4, 0xff0000
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v4, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_or_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX942-NEXT: v_and_or_b32 v3, v3, s4, v5
+; GFX942-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX942-NEXT: v_and_or_b32 v3, v3, s4, v4
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB0_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dword v1, v1, s[2:3]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX942-NEXT: global_load_dword v0, v0, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX942-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX942-NEXT: v_and_or_b32 v3, v1, s4, v2
+; GFX942-NEXT: v_and_or_b32 v3, v0, s4, v2
; GFX942-NEXT: .LBB0_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: global_store_byte_d16_hi v0, v3, s[6:7] offset:2
-; GFX942-NEXT: global_store_short v0, v3, s[6:7]
+; GFX942-NEXT: global_store_byte_d16_hi v1, v3, s[6:7] offset:2
+; GFX942-NEXT: global_store_short v1, v3, s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -57,20 +58,21 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dword v2, v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB1_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dword v2, v1, s[2:3]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX942-NEXT: global_load_dword v2, v0, s[2:3]
; GFX942-NEXT: .LBB1_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX942-NEXT: global_store_dword v1, v2, s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -94,24 +96,25 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB2_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX942-NEXT: .LBB2_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: global_store_byte v2, v1, s[6:7] offset:4
-; GFX942-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX942-NEXT: global_store_byte v3, v1, s[6:7] offset:4
+; GFX942-NEXT: global_store_dword v3, v0, s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -135,20 +138,21 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB3_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v0, s[2:3]
; GFX942-NEXT: .LBB3_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -172,20 +176,21 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v6
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB4_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3]
; GFX942-NEXT: .LBB4_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -209,24 +214,25 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v10
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3]
; GFX942-NEXT: .LBB5_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -250,9 +256,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v62, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v62
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240
; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224
@@ -270,52 +276,53 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32
; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16
; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v62
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[2:3] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[2:3] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[2:3] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[2:3] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[2:3] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[2:3] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:144
-; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] offset:128
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[2:3] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[2:3] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[2:3] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[2:3] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[2:3] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[2:3]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v0, s[2:3] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v0, s[2:3] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v0, s[2:3] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v0, s[2:3] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v0, s[2:3] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v0, s[2:3] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v0, s[2:3] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v0, s[2:3] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v0, s[2:3] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v0, s[2:3] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v0, s[2:3] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v0, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v0, s[2:3]
; GFX942-NEXT: .LBB6_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:112
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] offset:112
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[58:61], s[6:7] offset:96
+; GFX942-NEXT: global_store_dwordx4 v1, v[58:61], s[6:7] offset:96
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[54:57], s[6:7] offset:80
+; GFX942-NEXT: global_store_dwordx4 v1, v[54:57], s[6:7] offset:80
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[50:53], s[6:7] offset:64
+; GFX942-NEXT: global_store_dwordx4 v1, v[50:53], s[6:7] offset:64
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[46:49], s[6:7] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, v[46:49], s[6:7] offset:48
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[42:45], s[6:7] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, v[42:45], s[6:7] offset:32
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[38:41], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v1, v[38:41], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[34:37], s[6:7]
-; GFX942-NEXT: global_store_dwordx4 v0, v[30:33], s[6:7] offset:240
-; GFX942-NEXT: global_store_dwordx4 v0, v[26:29], s[6:7] offset:224
-; GFX942-NEXT: global_store_dwordx4 v0, v[22:25], s[6:7] offset:208
-; GFX942-NEXT: global_store_dwordx4 v0, v[18:21], s[6:7] offset:192
-; GFX942-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] offset:176
-; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] offset:160
-; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:144
-; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] offset:128
+; GFX942-NEXT: global_store_dwordx4 v1, v[34:37], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v1, v[30:33], s[6:7] offset:240
+; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[6:7] offset:224
+; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[6:7] offset:208
+; GFX942-NEXT: global_store_dwordx4 v1, v[18:21], s[6:7] offset:192
+; GFX942-NEXT: global_store_dwordx4 v1, v[14:17], s[6:7] offset:176
+; GFX942-NEXT: global_store_dwordx4 v1, v[10:13], s[6:7] offset:160
+; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[6:7] offset:144
+; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[6:7] offset:128
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -400,6 +407,7 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB8_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
@@ -457,6 +465,7 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB9_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[10:11]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_waitcnt vmcnt(1)
@@ -507,85 +516,86 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_phi_const:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v16, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v16
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v16
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v16
+; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v4
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9]
; GFX942-NEXT: ; implicit-def: $vgpr2
-; GFX942-NEXT: ; implicit-def: $vgpr12
-; GFX942-NEXT: ; implicit-def: $vgpr10
; GFX942-NEXT: ; implicit-def: $vgpr13
-; GFX942-NEXT: ; implicit-def: $vgpr14
; GFX942-NEXT: ; implicit-def: $vgpr11
+; GFX942-NEXT: ; implicit-def: $vgpr14
; GFX942-NEXT: ; implicit-def: $vgpr15
+; GFX942-NEXT: ; implicit-def: $vgpr12
+; GFX942-NEXT: ; implicit-def: $vgpr16
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v0
+; GFX942-NEXT: v_lshrrev_b32_e32 v5, 24, v1
+; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX942-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX942-NEXT: v_lshrrev_b32_e32 v8, 24, v0
+; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX942-NEXT: v_lshrrev_b32_e32 v10, 8, v0
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB10_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v3, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v16
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v4
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v0, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: v_mov_b32_e32 v4, 8
-; GFX942-NEXT: v_mov_b32_e32 v5, 7
-; GFX942-NEXT: v_mov_b32_e32 v6, 6
-; GFX942-NEXT: v_mov_b32_e32 v1, 5
-; GFX942-NEXT: v_mov_b32_e32 v7, 4
-; GFX942-NEXT: v_mov_b32_e32 v8, 3
-; GFX942-NEXT: v_mov_b32_e32 v9, 2
; GFX942-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-NEXT: v_mov_b32_e32 v10, 2
+; GFX942-NEXT: v_mov_b32_e32 v9, 3
+; GFX942-NEXT: v_mov_b32_e32 v8, 4
+; GFX942-NEXT: v_mov_b32_e32 v1, 5
+; GFX942-NEXT: v_mov_b32_e32 v7, 6
+; GFX942-NEXT: v_mov_b32_e32 v6, 7
+; GFX942-NEXT: v_mov_b32_e32 v5, 8
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v16, 24, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v15, 8, v3
+; GFX942-NEXT: v_lshrrev_b32_e32 v14, 24, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX942-NEXT: v_lshrrev_b32_e32 v13, 8, v2
; GFX942-NEXT: .LBB10_2: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB10_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v9
-; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7
+; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v10
+; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v8
; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v4
+; GFX942-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v5
; GFX942-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7
; GFX942-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[12:13]
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13]
; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v12, v9
-; GFX942-NEXT: v_mov_b32_e32 v10, v8
-; GFX942-NEXT: v_mov_b32_e32 v13, v7
+; GFX942-NEXT: v_mov_b32_e32 v13, v10
+; GFX942-NEXT: v_mov_b32_e32 v11, v9
+; GFX942-NEXT: v_mov_b32_e32 v14, v8
; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v14, v6
-; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: v_mov_b32_e32 v15, v4
+; GFX942-NEXT: v_mov_b32_e32 v15, v7
+; GFX942-NEXT: v_mov_b32_e32 v12, v6
+; GFX942-NEXT: v_mov_b32_e32 v16, v5
; GFX942-NEXT: .LBB10_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v12
-; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v13
+; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v13
+; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14
; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v15
+; GFX942-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v16
; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14
+; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v15
; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
@@ -617,30 +627,31 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[8:9]
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[4:5]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_4
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_3
; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[12:13]
; GFX942-NEXT: .LBB11_3: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: .LBB11_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -858,16 +869,17 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-LABEL: v8i8_mfma_i8:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB14_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v0, s[10:11]
; GFX942-NEXT: .LBB14_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0
@@ -880,7 +892,7 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[12:13]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -908,16 +920,17 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_mfma_half:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37]
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB15_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39]
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v0, s[38:39]
; GFX942-NEXT: .LBB15_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0
@@ -960,14 +973,14 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41]
+; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[40:41] offset:112
+; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[40:41] offset:96
+; GFX942-NEXT: global_store_dwordx4 v1, a[20:23], s[40:41] offset:80
+; GFX942-NEXT: global_store_dwordx4 v1, a[16:19], s[40:41] offset:64
+; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[40:41] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[40:41] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[40:41] offset:16
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[40:41]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -995,21 +1008,23 @@ define amdgpu_kernel void @v8i8_intrinsic(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB16_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX942-NEXT: .LBB16_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1]
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-branch-commits
mailing list